diff --git a/config/arch.aarch64 b/config/arch.aarch64 index 866b7e69ba..eac9e22b8c 100644 --- a/config/arch.aarch64 +++ b/config/arch.aarch64 @@ -11,7 +11,7 @@ fi # TARGET_CPU: -# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72 +# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72 cortex-a76 # exynos-m1 qdf24xx thunderx xgene1 cortex-a57.cortex-a53 # cortex-a72.cortex-a53 @@ -21,6 +21,10 @@ TARGET_SUBARCH=aarch64 TARGET_VARIANT=armv8-a ;; + cortex-a76) + TARGET_SUBARCH=aarch64 + TARGET_VARIANT=armv8.2-a + ;; esac TARGET_GCC_ARCH=${TARGET_SUBARCH/-} diff --git a/config/noobs/partition_setup.sh b/config/noobs/partition_setup.sh index 36e8cb6e92..c0dd86349a 100755 --- a/config/noobs/partition_setup.sh +++ b/config/noobs/partition_setup.sh @@ -66,7 +66,7 @@ fi # create bootloader configuration echo "creating bootloader configuration..." - echo "boot=$id1 disk=$id2 quiet" > $MOUNTPOINT/cmdline.txt + echo "boot=$id1 disk=$id2 quiet @EXTRA_CMDLINE@" > $MOUNTPOINT/cmdline.txt # cleanup mountpoint umount $MOUNTPOINT diff --git a/licenses/FLIRC.txt b/licenses/FLIRC.txt new file mode 100644 index 0000000000..d135582636 --- /dev/null +++ b/licenses/FLIRC.txt @@ -0,0 +1,32 @@ +/* + * Copyright 2023 Flirc Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY FLIRC INC. 
\`\`AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ${AUTHOR_UPPER} OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of Flirc Inc. + */ diff --git a/licenses/HIDAPI-orig.txt b/licenses/HIDAPI-orig.txt new file mode 100644 index 0000000000..e3f3380829 --- /dev/null +++ b/licenses/HIDAPI-orig.txt @@ -0,0 +1,9 @@ + HIDAPI - Multi-Platform library for + communication with HID devices. + + Copyright 2009, Alan Ott, Signal 11 Software. + All Rights Reserved. + + This software may be used by anyone for any reason so + long as the copyright notice in the source files + remains intact. 
diff --git a/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk b/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk index eaafbf45c7..64dc139cac 100644 --- a/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk +++ b/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk @@ -2,7 +2,7 @@ # Copyright (C) 2022-present Team LibreELEC (https://libreelec.tv) PKG_NAME="aspnet6-runtime" -PKG_VERSION="6.0.20" +PKG_VERSION="6.0.24" PKG_LICENSE="MIT" PKG_SITE="https://dotnet.microsoft.com/" PKG_DEPENDS_TARGET="toolchain" @@ -11,16 +11,16 @@ PKG_TOOLCHAIN="manual" case "${ARCH}" in "aarch64") - PKG_SHA256="dd1898babdba27c57338b17afd4513a53025dec0985047d030336aab65532e26" - PKG_URL="https://download.visualstudio.microsoft.com/download/pr/a8a1a993-ddd9-4bcd-8386-d9defcf0fd29/4b471f72c8253fa1462ea923d0fe39a2/aspnetcore-runtime-6.0.20-linux-arm64.tar.gz" + PKG_SHA256="ee6b660b3c8b3fb88eb64690ac78a47752dae68c21647fccdc5f810bc68829ab" + PKG_URL="https://download.visualstudio.microsoft.com/download/pr/d562ba2b-8e2c-48e5-9853-f8616a9cb4e4/f4e251ba67b718083c28017e3b0c6349/aspnetcore-runtime-6.0.24-linux-arm64.tar.gz" ;; "arm") - PKG_SHA256="f26a0f36339056d65522254c4bf333c940abc3dee907d4219a64cc1456b63fe3" - PKG_URL="https://download.visualstudio.microsoft.com/download/pr/872ccb13-fbc4-4d75-9d8f-be3fec5581ef/add2199206c438835b7b48a6d061b023/aspnetcore-runtime-6.0.20-linux-arm.tar.gz" + PKG_SHA256="634b0ecd7312e8a46adedcbff6e1b23e514fa153f7135a6b9f6aefb5851f9d88" + PKG_URL="https://download.visualstudio.microsoft.com/download/pr/9c00fe25-e1e0-4390-9061-77d07e95356f/09886ffeaed522c3fa8803e879ce070c/aspnetcore-runtime-6.0.24-linux-arm.tar.gz" ;; "x86_64") - PKG_SHA256="88afcf5b6434c6a4ee12488d8bc13f84c15191712d12eb9646cf3642b9c01e86" - 
PKG_URL="https://download.visualstudio.microsoft.com/download/pr/972dc929-4c16-4456-a7c8-64014f80678d/a3b62252f98a0d7e0c0a9a01ede18776/aspnetcore-runtime-6.0.20-linux-x64.tar.gz" + PKG_SHA256="022dc914af7490bcd2d885edeb5d4c1faa4b771b503b8059d5181f130191cf2c" + PKG_URL="https://download.visualstudio.microsoft.com/download/pr/8f5a65c0-9bc8-497d-9ce2-4658c461dc55/b6c01c3cd060552d987501ba6bbde09f/aspnetcore-runtime-6.0.24-linux-x64.tar.gz" ;; esac PKG_SOURCE_NAME="aspnetcore-runtime_${PKG_VERSION}_${ARCH}.tar.gz" diff --git a/packages/addons/addon-depends/hidapi/package.mk b/packages/addons/addon-depends/hidapi/package.mk new file mode 100644 index 0000000000..bee13ef94b --- /dev/null +++ b/packages/addons/addon-depends/hidapi/package.mk @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv) + +PKG_NAME="hidapi" +PKG_VERSION="0.14.0" +PKG_SHA256="a5714234abe6e1f53647dd8cba7d69f65f71c558b7896ed218864ffcf405bcbd" +PKG_LICENSE="HIDAPI-orig" +PKG_SITE="http://libusb.info/" +PKG_URL="https://github.com/libusb/hidapi/archive/refs/tags/hidapi-${PKG_VERSION}.tar.gz" +PKG_DEPENDS_TARGET="toolchain libusb" +PKG_LONGDESC="HIDAPI is a multi-platform library which allows an application to interface with USB and Bluetooth HID-Class devices." 
+PKG_TOOLCHAIN="cmake" diff --git a/packages/addons/addon-depends/libzip/package.mk b/packages/addons/addon-depends/libzip/package.mk index 919470a2d8..ac6129091a 100644 --- a/packages/addons/addon-depends/libzip/package.mk +++ b/packages/addons/addon-depends/libzip/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv) PKG_NAME="libzip" -PKG_VERSION="1.9.2" -PKG_SHA256="c93e9852b7b2dc931197831438fee5295976ee0ba24f8524a8907be5c2ba5937" +PKG_VERSION="1.10.1" +PKG_SHA256="dc3c8d5b4c8bbd09626864f6bcf93de701540f761d76b85d7c7d710f4bd90318" PKG_LICENSE="GPL" PKG_SITE="https://libzip.org/" PKG_URL="https://libzip.org/download/${PKG_NAME}-${PKG_VERSION}.tar.xz" @@ -16,6 +16,7 @@ PKG_CMAKE_OPTS_TARGET="-DENABLE_COMMONCRYPTO=OFF \ -DENABLE_MBEDTLS=OFF \ -DENABLE_OPENSSL=OFF \ -DENABLE_WINDOWS_CRYPTO=OFF \ + -DENABLE_ZSTD=OFF \ -DBUILD_TOOLS=OFF \ -DBUILD_REGRESS=OFF \ -DBUILD_EXAMPLES=OFF \ diff --git a/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk index 97927441d6..229961b233 100644 --- a/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk +++ b/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="libmediainfo" -PKG_VERSION="22.12" -PKG_SHA256="0fc6d32f06d6ce5e144074d2e57e0db8dfa4e38e752d3123ada27ccaf89634bc" +PKG_VERSION="23.07" +PKG_SHA256="60456c8b2ab8769a6081d96fd7be86db4fe32520e4a022397cb22cacf47ce820" PKG_LICENSE="GPL" PKG_SITE="https://mediaarea.net/en/MediaInfo/Download/Source" PKG_URL="https://mediaarea.net/download/source/libmediainfo/${PKG_VERSION}/libmediainfo_${PKG_VERSION}.tar.xz" diff --git a/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk 
b/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk index 603bfbbbaf..bb354ad42e 100644 --- a/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk +++ b/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="libzen" -PKG_VERSION="0.4.40" -PKG_SHA256="0c2e1c7302b3ee260d34b52e4b16ab655bdf021db8c14653e418aced46eb24a7" +PKG_VERSION="0.4.41" +PKG_SHA256="933bad3b7ecd29dc6bdc88a83645c83dfd098c15b0b90d6177a37fa1536704e8" PKG_LICENSE="GPL" PKG_SITE="https://mediaarea.net/en/MediaInfo/" PKG_URL="https://mediaarea.net/download/source/libzen/${PKG_VERSION}/libzen_${PKG_VERSION}.tar.xz" diff --git a/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk index 67d98b8d09..1d371399b4 100644 --- a/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk +++ b/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="mediainfo" -PKG_VERSION="22.12" -PKG_SHA256="932b82739f738e7db603cf5bb170720731a9e7c61d145c2a54aabb3cd0b753bc" +PKG_VERSION="23.07" +PKG_SHA256="b6d7da9e29995fd34a22100825b843e74c32c7bc67adb01166b1beedea49f5d0" PKG_LICENSE="GPL" PKG_SITE="https://mediaarea.net/en/MediaInfo/Download/Source" PKG_URL="https://mediaarea.net/download/source/mediainfo/${PKG_VERSION}/mediainfo_${PKG_VERSION}.tar.xz" diff --git a/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk index d7bcdc1797..774181eb02 100644 --- a/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk +++ b/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk @@ 
-2,8 +2,8 @@ # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv) PKG_NAME="mpv-drmprime" -PKG_VERSION="0.35.1" -PKG_SHA256="41df981b7b84e33a2ef4478aaf81d6f4f5c8b9cd2c0d337ac142fc20b387d1a9" +PKG_VERSION="0.36.0" +PKG_SHA256="29abc44f8ebee013bb2f9fe14d80b30db19b534c679056e4851ceadf5a5e8bf6" PKG_LICENSE="GPL" PKG_SITE="https://mpv.io/" PKG_URL="https://github.com/mpv-player/mpv/archive/v${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/opus/package.mk b/packages/addons/addon-depends/opus/package.mk index 06881c6bca..572f9019aa 100644 --- a/packages/addons/addon-depends/opus/package.mk +++ b/packages/addons/addon-depends/opus/package.mk @@ -2,11 +2,11 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="opus" -PKG_VERSION="1.3.1" -PKG_SHA256="65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d" +PKG_VERSION="1.4" +PKG_SHA256="c9b32b4253be5ae63d1ff16eea06b94b5f0f2951b7a02aceef58e3a3ce49c51f" PKG_LICENSE="BSD" PKG_SITE="http://www.opus-codec.org" -PKG_URL="https://archive.mozilla.org/pub/opus/${PKG_NAME}-${PKG_VERSION}.tar.gz" +PKG_URL="https://github.com/xiph/opus/releases/download/v${PKG_VERSION}/${PKG_NAME}-${PKG_VERSION}.tar.gz" PKG_DEPENDS_TARGET="toolchain" PKG_LONGDESC="Codec designed for interactive speech and audio transmission over the Internet." 
PKG_TOOLCHAIN="configure" diff --git a/packages/addons/addon-depends/pngquant/package.mk b/packages/addons/addon-depends/pngquant/package.mk index 1e8f8ff1d5..e92d89f636 100644 --- a/packages/addons/addon-depends/pngquant/package.mk +++ b/packages/addons/addon-depends/pngquant/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pngquant" -PKG_VERSION="2.17.0" -PKG_SHA256="a27cf0e64db499ccb3ddae9b36036e881f78293e46ec27a9e7a86a3802fcda66" +PKG_VERSION="2.18.0" +PKG_SHA256="e72194b52b36f040deaec49a1ddd5dcd8d4feecc3a5fe6c5e9589a9707b233d4" PKG_LICENSE="GPLv3" PKG_SITE="https://pngquant.org" PKG_URL="https://pngquant.org/pngquant-${PKG_VERSION}-src.tar.gz" diff --git a/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk b/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk index 33b1d61bfb..13b6596201 100644 --- a/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk +++ b/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="libfastjson" -PKG_VERSION="0.99.9" -PKG_SHA256="a330e1bdef3096b7ead53b4bad1a6158f19ba9c9ec7c36eda57de7729d84aaee" +PKG_VERSION="1.2304.0" +PKG_SHA256="ef30d1e57a18ec770f90056aaac77300270c6203bbe476f4181cc83a2d5dc80c" PKG_LICENSE="GPL" PKG_SITE="https://www.rsyslog.com/tag/libfastjson" PKG_URL="https://download.rsyslog.com/libfastjson/${PKG_NAME}-${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/bottom/package.mk b/packages/addons/addon-depends/system-tools-depends/bottom/package.mk index e724304e57..ade916dd76 100644 --- a/packages/addons/addon-depends/system-tools-depends/bottom/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/bottom/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2020-present Team LibreELEC (https://libreelec.tv) PKG_NAME="bottom" -PKG_VERSION="0.9.3" 
-PKG_SHA256="53a1466c3d2ed8f38401e8929cf2da796e703e4d70339d215f855b2304c07f72" +PKG_VERSION="0.9.4" +PKG_SHA256="199123ef354bcabaa8a2e3b7b477b324f5b647d503a2599d08296733846eea6e" PKG_LICENSE="MIT" PKG_SITE="https://github.com/ClementTsang/bottom" PKG_URL="https://github.com/ClementTsang/bottom/archive/${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk b/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk index cfb9a6647b..806c1d07ff 100644 --- a/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="libmtp" -PKG_VERSION="1.1.20" -PKG_SHA256="c9191dac2f5744cf402e08641610b271f73ac21a3c802734ec2cedb2c6bc56d0" +PKG_VERSION="1.1.21" +PKG_SHA256="c4ffa5ab8c8f48c91b0047f2e253c101c418d5696a5ed65c839922a4280872a7" PKG_LICENSE="GPL" PKG_SITE="http://libmtp.sourceforge.net/" PKG_URL="${SOURCEFORGE_SRC}/project/${PKG_NAME}/${PKG_NAME}/${PKG_VERSION}/${PKG_NAME}-${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/mc/package.mk b/packages/addons/addon-depends/system-tools-depends/mc/package.mk index 40d0e8c829..49b78d3082 100644 --- a/packages/addons/addon-depends/system-tools-depends/mc/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/mc/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="mc" -PKG_VERSION="4.8.29" -PKG_SHA256="01d8a3b94f58180cca5bf17257b5078d1fd6fd27a9b5c0e970ec767549540ad4" +PKG_VERSION="4.8.30" +PKG_SHA256="5ebc3cb2144b970c5149fda556c4ad50b78780494696cdf2d14a53204c95c7df" PKG_LICENSE="GPL" PKG_SITE="http://www.midnight-commander.org" PKG_URL="http://ftp.midnight-commander.org/mc-${PKG_VERSION}.tar.xz" diff --git 
a/packages/addons/addon-depends/system-tools-depends/screen/package.mk b/packages/addons/addon-depends/system-tools-depends/screen/package.mk index 2c8abfd88a..abac4b4d47 100644 --- a/packages/addons/addon-depends/system-tools-depends/screen/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/screen/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="screen" -PKG_VERSION="4.9.0" -PKG_SHA256="f9335281bb4d1538ed078df78a20c2f39d3af9a4e91c57d084271e0289c730f4" +PKG_VERSION="4.9.1" +PKG_SHA256="26cef3e3c42571c0d484ad6faf110c5c15091fbf872b06fa7aa4766c7405ac69" PKG_LICENSE="GPL" PKG_SITE="http://www.gnu.org/software/screen/" PKG_URL="http://ftpmirror.gnu.org/screen/${PKG_NAME}-${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk b/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk index 6f8efef13c..ecb78ff7c1 100644 --- a/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="smartmontools" -PKG_VERSION="7.3" -PKG_SHA256="a544f8808d0c58cfb0e7424ca1841cb858a974922b035d505d4e4c248be3a22b" +PKG_VERSION="7.4" +PKG_SHA256="e9a61f641ff96ca95319edfb17948cd297d0cd3342736b2c49c99d4716fb993d" PKG_LICENSE="GPL" PKG_SITE="https://www.smartmontools.org" PKG_URL="https://downloads.sourceforge.net/sourceforge/smartmontools/smartmontools-${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk b/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk index 5dda3fe52b..dd5abf908b 100644 --- a/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk +++ b/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 
2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="stress-ng" -PKG_VERSION="0.16.02" -PKG_SHA256="71ac375826cc58dcbcf5f1609959ed1a5afd71192c52025b5cb273baa3df2317" +PKG_VERSION="0.16.04" +PKG_SHA256="3453719508e9e02c57a736c154408538372d078be7dcf8e0165d37a821cdba45" PKG_LICENSE="GPLv2" PKG_SITE="https://github.com/ColinIanKing/stress-ng" PKG_URL="https://github.com/ColinIanKing/stress-ng/archive/refs/tags/V${PKG_VERSION}.tar.gz" diff --git a/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch b/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch deleted file mode 100644 index cc4a558133..0000000000 --- a/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/Makefile.config 2023-07-13 07:58:21.000000000 +0000 -+++ b/Makefile.config 2023-07-16 05:05:35.766646855 +0000 -@@ -311,7 +311,7 @@ - compiler: configdir - @echo "checking compiler ..." 
- @$(CC) test/test-compiler.c -o test/test-compiler -- @echo "" > $(CONFIGS)/$$(./test/test-compiler) -+ @echo "" > $(CONFIGS)/HAVE_COMPILER_GCC - @rm -f test/test-compiler - $(call check,test-glibc,HAVE_GLIBC,using glibc) - diff --git a/packages/addons/service/librespot/changelog.txt b/packages/addons/service/librespot/changelog.txt index df2757fa99..ae4ccbae71 100644 --- a/packages/addons/service/librespot/changelog.txt +++ b/packages/addons/service/librespot/changelog.txt @@ -2,3 +2,5 @@ - update librespot to githash 03b547d (2023-04-16) 2 - update librespot to githash c964102 (2023-05-14) +3 +- update librespot to githash f037e46 (2023-07-19) diff --git a/packages/addons/service/librespot/package.mk b/packages/addons/service/librespot/package.mk index bcaf03d544..0059e8f4b2 100644 --- a/packages/addons/service/librespot/package.mk +++ b/packages/addons/service/librespot/package.mk @@ -3,10 +3,10 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="librespot" -PKG_VERSION="c964102a349589d644baef5f43a566d6d1e151f1" -PKG_VERSION_DATE="2023-05-14" -PKG_SHA256="3bc6782d8796253040d995043fba4e6f6c71ff902da110b50398071e082b1930" -PKG_REV="2" +PKG_VERSION="f037e46aee631837a0553ccfdbc7866752fd0f5d" +PKG_VERSION_DATE="2023-07-19" +PKG_SHA256="72ec541fda77ea6a2132dd67f68a89437bfc13513481d5abbdde535976fc60c3" +PKG_REV="3" PKG_ARCH="any" PKG_LICENSE="MIT" PKG_SITE="https://github.com/librespot-org/librespot/" diff --git a/packages/addons/service/mariadb/changelog.txt b/packages/addons/service/mariadb/changelog.txt index 3a0ffce00b..7c7ea4e5f2 100644 --- a/packages/addons/service/mariadb/changelog.txt +++ b/packages/addons/service/mariadb/changelog.txt @@ -1,3 +1,6 @@ +2 +- mariadb: update to 10.11.5 + 1 - include mariadb-upgrade and mariadb-check copy mariadb* binaries and make symbolic links to mysql* diff --git a/packages/addons/service/mariadb/package.mk b/packages/addons/service/mariadb/package.mk index b25ec2b9ad..c88e713af2 100644 --- 
a/packages/addons/service/mariadb/package.mk +++ b/packages/addons/service/mariadb/package.mk @@ -2,9 +2,9 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="mariadb" -PKG_VERSION="10.11.2" -PKG_REV="1" -PKG_SHA256="1c89dee0caed0f68bc2a1d203eb98a123150e6a179f6ee0f1fc0ba3f08dc71dc" +PKG_VERSION="10.11.5" +PKG_REV="2" +PKG_SHA256="4c9484048d4d0c71dd076ab33fc2a9ce8510bdf762886de0d63fe52496f3dbbb" PKG_LICENSE="GPL2" PKG_SITE="https://mariadb.org" PKG_URL="https://downloads.mariadb.com/MariaDB/${PKG_NAME}-${PKG_VERSION}/source/${PKG_NAME}-${PKG_VERSION}.tar.gz" diff --git a/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch b/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch index dc8c6d842d..721ab3735d 100644 --- a/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch +++ b/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch @@ -11,7 +11,7 @@ diff --git a/cmake/build_configurations/mysql_release.cmake b/cmake/build_config index 37a6c45..e2a4ba8 100644 --- a/cmake/build_configurations/mysql_release.cmake +++ b/cmake/build_configurations/mysql_release.cmake -@@ -124,7 +124,7 @@ ENDIF() +@@ -147,7 +147,7 @@ ENDIF() IF(UNIX) SET(WITH_EXTRA_CHARSETS all CACHE STRING "") @@ -19,7 +19,7 @@ index 37a6c45..e2a4ba8 100644 + SET(PLUGIN_AUTH_PAM NO CACHE BOOL "") IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") - IF(NOT IGNORE_AIO_CHECK) + FIND_PACKAGE(URING) -- 2.7.4 diff --git a/packages/addons/service/net-snmp/changelog.txt b/packages/addons/service/net-snmp/changelog.txt index 927b2407d4..df8d025e2c 100644 --- a/packages/addons/service/net-snmp/changelog.txt +++ b/packages/addons/service/net-snmp/changelog.txt @@ -1 +1,2 @@ -initial release +1 +- net-snmp: update to 5.9.4 diff --git a/packages/addons/service/net-snmp/package.mk b/packages/addons/service/net-snmp/package.mk index 64c06edf4d..d529317bad 100644 --- 
a/packages/addons/service/net-snmp/package.mk +++ b/packages/addons/service/net-snmp/package.mk @@ -2,9 +2,9 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="net-snmp" -PKG_VERSION="5.9.3" -PKG_SHA256="2097f29b7e1bf3f1300b4bae52fa2308d0bb8d5d3998dbe02f9462a413a2ef0a" -PKG_REV="0" +PKG_VERSION="5.9.4" +PKG_SHA256="8b4de01391e74e3c7014beb43961a2d6d6fa03acc34280b9585f4930745b0544" +PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="BSD" PKG_SITE="http://www.net-snmp.org" diff --git a/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch b/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch index 45dfcf9f09..ce6bf1136b 100644 --- a/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch +++ b/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch @@ -1,14 +1,14 @@ --- net-snmp-5.9/net-snmp-create-v3-user.in 2020-08-14 21:41:47.000000000 +0000 +++ net-snmp-5.9/net-snmp-create-v3-user.in 2021-01-14 07:04:26.196982169 +0000 -@@ -5,10 +5,8 @@ +@@ -3,10 +3,8 @@ # this shell script is designed to add new SNMPv3 users # to Net-SNMP config file. --if @PSCMD@ | egrep ' snmpd *$' > /dev/null 2>&1 ; then -- echo "Apparently at least one snmpd demon is already running." +-if @PSCMD@ | @EGREP@ ' snmpd *$' > /dev/null 2>&1 ; then +- echo "Apparently at least one snmpd daemon is already running." - echo "You must stop them in order to use this command." 
- exit 1 -+if @PSCMD@ | egrep 'snmpd'> /dev/null 2>&1 ; then ++if @PSCMD@ | @EGREP@ 'snmpd' > /dev/null 2>&1 ; then + systemctl stop service.net-snmp.service fi diff --git a/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch b/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch deleted file mode 100644 index 0da33713be..0000000000 --- a/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch +++ /dev/null @@ -1,34 +0,0 @@ -diff -ur net-snmp-5.7.3/config.sub net-snmp-5.7.3.new/config.sub ---- net-snmp-5.7.3/config.sub 2014-12-08 21:23:22.000000000 +0100 -+++ net-snmp-5.7.3.new/config.sub 2017-11-09 18:39:31.638689732 +0100 -@@ -247,10 +247,11 @@ case $basic_machine in - # Some are omitted here because they have special meanings below. - 1750a | 580 \ - | a29k \ -+ | aarch64 | aarch64_be \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ - | am33_2.0 \ -- | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ -+ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ - | bfin \ - | c4x | clipper \ - | d10v | d30v | dlx | dsp16xx \ -@@ -339,6 +340,7 @@ case $basic_machine in - # Recognize the basic CPU types with company name. 
- 580-* \ - | a29k-* \ -+ | aarch64-* | aarch64_be-* \ - | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ - | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ -@@ -1171,6 +1173,9 @@ case $basic_machine in - basic_machine=hppa1.1-winbond - os=-proelf - ;; -+ x64) -+ basic_machine=x86_64-pc -+ ;; - xbox) - basic_machine=i686-pc - os=-mingw32 diff --git a/packages/addons/service/nextpvr/changelog.txt b/packages/addons/service/nextpvr/changelog.txt index 17d0b79bb9..99dd46be22 100644 --- a/packages/addons/service/nextpvr/changelog.txt +++ b/packages/addons/service/nextpvr/changelog.txt @@ -1,3 +1,6 @@ +5 +- download NextPVR 6.1.5 + 4 - download NextPVR 6.1.4 diff --git a/packages/addons/service/nextpvr/package.mk b/packages/addons/service/nextpvr/package.mk index ab1c5a05cf..d04a05de10 100644 --- a/packages/addons/service/nextpvr/package.mk +++ b/packages/addons/service/nextpvr/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2021-present Team LibreELEC (https://libreelec.tv) PKG_NAME="nextpvr" -PKG_VERSION="6.1.4~Nexus" -PKG_ADDON_VERSION="6.1.4~4" +PKG_VERSION="6.1.5~Nexus" +PKG_ADDON_VERSION="6.1.5~5" PKG_REV="0" PKG_ARCH="any" PKG_LICENSE="NextPVR" diff --git a/packages/addons/service/rsyslog/changelog.txt b/packages/addons/service/rsyslog/changelog.txt index 927b2407d4..ac32f50133 100755 --- a/packages/addons/service/rsyslog/changelog.txt +++ b/packages/addons/service/rsyslog/changelog.txt @@ -1 +1,3 @@ -initial release +1 +- libfastjson: update to 1.2304.0 +- rsyslog: update to 8.2308.0 diff --git a/packages/addons/service/rsyslog/package.mk b/packages/addons/service/rsyslog/package.mk index 369f3de094..b9969cdac7 100644 --- a/packages/addons/service/rsyslog/package.mk +++ b/packages/addons/service/rsyslog/package.mk @@ -2,9 +2,9 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="rsyslog" -PKG_VERSION="8.2302.0" 
-PKG_SHA256="25415f85b662615ce3c83077d53758029e8743cb5929044bfd3564e3d626a3b9" -PKG_REV="0" +PKG_VERSION="8.2308.0" +PKG_SHA256="02086b9121e872cea69e5d0f6c8e2d8ebff33234b3cad5503665378d3af2e3c9" +PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/rsyslog" diff --git a/packages/addons/tools/btrfs-progs/changelog.txt b/packages/addons/tools/btrfs-progs/changelog.txt index a195358cf9..a514ca0bf4 100644 --- a/packages/addons/tools/btrfs-progs/changelog.txt +++ b/packages/addons/tools/btrfs-progs/changelog.txt @@ -1,2 +1,5 @@ +2 +- btrfs-progs: update to 6.3.3 + 1 - btrfs-progs: update to 6.2.2 diff --git a/packages/addons/tools/btrfs-progs/package.mk b/packages/addons/tools/btrfs-progs/package.mk index b2e32b4e51..aade248a60 100644 --- a/packages/addons/tools/btrfs-progs/package.mk +++ b/packages/addons/tools/btrfs-progs/package.mk @@ -2,9 +2,9 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="btrfs-progs" -PKG_VERSION="6.2.2" -PKG_SHA256="140d3d98f2cba4c7f05c16aec3038f044e11555a40c27a5006185c99a10c7ca2" -PKG_REV="1" +PKG_VERSION="6.3.3" +PKG_SHA256="0e55374e448ad4d8876db9c676669bedc16cb763e2493b14c245df8c5d00064b" +PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://btrfs.wiki.kernel.org/index.php/Main_Page" diff --git a/packages/addons/tools/dotnet-runtime/changelog.txt b/packages/addons/tools/dotnet-runtime/changelog.txt index 87c91c385a..de8e35617e 100644 --- a/packages/addons/tools/dotnet-runtime/changelog.txt +++ b/packages/addons/tools/dotnet-runtime/changelog.txt @@ -1,3 +1,9 @@ +7 +- aspnet6-runtime: update to 6.0.24 + +6 +- aspnet6-runtime: update to 6.0.21 + 5 - aspnet6-runtime: update to 6.0.20 diff --git a/packages/addons/tools/dotnet-runtime/package.mk b/packages/addons/tools/dotnet-runtime/package.mk index d8a2408414..839ab52aa4 100644 --- a/packages/addons/tools/dotnet-runtime/package.mk +++ b/packages/addons/tools/dotnet-runtime/package.mk @@ -2,7 +2,7 @@ # Copyright (C) 2018-present Team 
LibreELEC (https://libreelec.tv) PKG_NAME="dotnet-runtime" -PKG_REV="5" +PKG_REV="7" PKG_ARCH="any" PKG_LICENSE="MIT" PKG_SITE="https://dotnet.microsoft.com/" diff --git a/packages/addons/tools/flirc_util/changelog.txt b/packages/addons/tools/flirc_util/changelog.txt new file mode 100644 index 0000000000..927b2407d4 --- /dev/null +++ b/packages/addons/tools/flirc_util/changelog.txt @@ -0,0 +1 @@ +initial release diff --git a/packages/addons/tools/flirc_util/icon/icon.png b/packages/addons/tools/flirc_util/icon/icon.png new file mode 100644 index 0000000000..8976d44345 Binary files /dev/null and b/packages/addons/tools/flirc_util/icon/icon.png differ diff --git a/packages/addons/tools/flirc_util/package.mk b/packages/addons/tools/flirc_util/package.mk new file mode 100644 index 0000000000..7b3febddf9 --- /dev/null +++ b/packages/addons/tools/flirc_util/package.mk @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv) + +PKG_NAME="flirc_util" +PKG_VERSION="8d3c86e8bb419ad44297c1b186f0cdc7dfcac915" # 30/10/2023 +PKG_SHA256="fc460e6ce5477cb6b83c90a5f8b2ebb9876ed23cdd813a6a4a0fdc3730052a2b" +PKG_LICENSE="FLIRC" +PKG_SITE="http://www.flirc.tv" +PKG_URL="https://github.com/flirc/sdk/archive/${PKG_VERSION}.tar.gz" +PKG_DEPENDS_TARGET="toolchain hidapi libusb" +PKG_SECTION="tools" +PKG_SHORTDESC="CLI utility for flirc IR receivers" +PKG_LONGDESC="Command-Line utility for configuring flirc IR receivers" +PKG_TOOLCHAIN="manual" + +PKG_IS_ADDON="yes" +PKG_ADDON_NAME="flirc_util" +PKG_ADDON_TYPE="xbmc.python.script" + +make_target() { + cd cli + make VERBOSE="1" \ + CONFIG="release" \ + HOSTOS="LIBREELEC" \ + MACHINE="Linux_${TARGET_ARCH}" \ + BUILDDIR_ROOT="${PKG_BUILD}/build" \ + BUILDDIR="${PKG_BUILD}/build" \ + LSEARCH+=" -L../libs/Linux_${TARGET_ARCH}" \ + flirc_util +} + +addon() { + mkdir -p ${ADDON_BUILD}/${PKG_ADDON_ID}/{bin,lib} + cp -P ${PKG_BUILD}/build/flirc_util 
${ADDON_BUILD}/${PKG_ADDON_ID}/bin/ + cp -P $(get_install_dir hidapi)/usr/lib/libhidapi-hidraw.so* ${ADDON_BUILD}/${PKG_ADDON_ID}/lib +} diff --git a/packages/addons/tools/flirc_util/source/default.py b/packages/addons/tools/flirc_util/source/default.py new file mode 100644 index 0000000000..e833d8f43e --- /dev/null +++ b/packages/addons/tools/flirc_util/source/default.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) + +import xbmcgui +import subprocess + +xbmcgui.Dialog().ok('', 'This is a console-only addon') + diff --git a/packages/addons/tools/multimedia-tools/changelog.txt b/packages/addons/tools/multimedia-tools/changelog.txt index 51b810aade..d609040653 100644 --- a/packages/addons/tools/multimedia-tools/changelog.txt +++ b/packages/addons/tools/multimedia-tools/changelog.txt @@ -1,3 +1,9 @@ +2 +- libmediainfo: update to 23.07 +- libzen: update to 0.4.41 +- mediainfo: update to 23.07 +- mpv-drmprime: update to 0.36.0 + 1 - mpg123: update to 1.31.3 - mpv-drmprime: update to 0.35.1 diff --git a/packages/addons/tools/multimedia-tools/package.mk b/packages/addons/tools/multimedia-tools/package.mk index 9b39cb0ffb..5002a1df58 100644 --- a/packages/addons/tools/multimedia-tools/package.mk +++ b/packages/addons/tools/multimedia-tools/package.mk @@ -3,7 +3,7 @@ PKG_NAME="multimedia-tools" PKG_VERSION="1.0" -PKG_REV="1" +PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://libreelec.tv" diff --git a/packages/addons/tools/system-tools/changelog.txt b/packages/addons/tools/system-tools/changelog.txt index 9863d92238..fd1e9a7826 100644 --- a/packages/addons/tools/system-tools/changelog.txt +++ b/packages/addons/tools/system-tools/changelog.txt @@ -1,3 +1,11 @@ +4 +- bottom: update to 0.9.4 +- libmtp: update to 1.1.21 +- mc: update to 4.8.30 +- screen: update to 4.9.1 +- smartmontools: update to 7.4 +- stress-ng: update to 0.16.04 + 3 - file: update to 5.45 - libssh2: dont build 
shared library diff --git a/packages/addons/tools/system-tools/package.mk b/packages/addons/tools/system-tools/package.mk index f7c2397afc..13e8dc4efe 100644 --- a/packages/addons/tools/system-tools/package.mk +++ b/packages/addons/tools/system-tools/package.mk @@ -3,7 +3,7 @@ PKG_NAME="system-tools" PKG_VERSION="1.0" -PKG_REV="3" +PKG_REV="4" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://libreelec.tv" diff --git a/packages/graphics/bcm2835-driver/package.mk b/packages/graphics/bcm2835-driver/package.mk index 105de7f03d..7676ae51f5 100644 --- a/packages/graphics/bcm2835-driver/package.mk +++ b/packages/graphics/bcm2835-driver/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="bcm2835-driver" -PKG_VERSION="543692d23dff7075915bc9c7e34abb3fe28e1c46" -PKG_SHA256="838aa79b842fc10030c6a8b8d7f8dc6b6b08ed7ac762a5488d3d64f1bc51a4ec" +PKG_VERSION="fdb9eafae4b83e553593937eae8e77b0193903c3" +PKG_SHA256="ae590baa29a507fa50b5beae46643519a2d2e012945668cfc7393f9275793c78" PKG_LICENSE="nonfree" PKG_SITE="http://www.broadcom.com" PKG_URL="${DISTRO_SRC}/${PKG_NAME}-${PKG_VERSION}.tar.xz" diff --git a/packages/graphics/mesa/package.mk b/packages/graphics/mesa/package.mk index 6fec8873f8..5bca413245 100644 --- a/packages/graphics/mesa/package.mk +++ b/packages/graphics/mesa/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="mesa" -PKG_VERSION="23.1.7" -PKG_SHA256="409641eadf0ed1c7794797a6f5a0b0195b5580b282166e5ec5629c6bcda6acd3" +PKG_VERSION="23.2.1" +PKG_SHA256="64de0616fc2d801f929ab1ac2a4f16b3e2783c4309a724c8a259b20df8bbc1cc" PKG_LICENSE="OSS" PKG_SITE="http://www.mesa3d.org/" PKG_URL="https://mesa.freedesktop.org/archive/mesa-${PKG_VERSION}.tar.xz" @@ -31,6 +31,12 @@ PKG_MESON_OPTS_TARGET="-Dgallium-drivers=${GALLIUM_DRIVERS// /,} \ -Dselinux=false \ -Dosmesa=false" +if [ "${DEVICE}" = "RPi5" ]; then + PKG_MESON_OPTS_TARGET+=" -Ddraw-use-llvm=false" +else + 
PKG_MESON_OPTS_TARGET+=" -Ddri-drivers=" +fi + if [ "${DISPLAYSERVER}" = "x11" ]; then PKG_DEPENDS_TARGET+=" xorgproto libXext libXdamage libXfixes libXxf86vm libxcb libX11 libxshmfence libXrandr" export X11_INCLUDES= diff --git a/packages/linux/package.mk b/packages/linux/package.mk index 6241e5bec6..1e7c5905b3 100644 --- a/packages/linux/package.mk +++ b/packages/linux/package.mk @@ -23,8 +23,8 @@ case "${LINUX}" in PKG_PATCH_DIRS="default" ;; raspberrypi) - PKG_VERSION="431319d91b8584c9f28b195ab9a97d7e78905aeb" # 6.1.42 - PKG_SHA256="1608f0741e02416e9af9cd0232e21a60c33db94dd0681d8f26909c2a7216038c" + PKG_VERSION="3fdb0eb8be5803a16fc308b8441cdcdeafbb944e" # 6.1.62 + PKG_SHA256="a44b7d642fc2d80a8cfd05d0b461ecbff019680932754b3f9190bb22be9b8fe2" PKG_URL="https://github.com/raspberrypi/linux/archive/${PKG_VERSION}.tar.gz" PKG_SOURCE_NAME="linux-${LINUX}-${PKG_VERSION}.tar.gz" ;; @@ -45,8 +45,8 @@ case "${LINUX}" in PKG_GIT_CLONE_BRANCH="sdm845-5.19.16" ;; *) - PKG_VERSION="6.1.42" - PKG_SHA256="aaf8261b551c8b76b81eab8780b446e88cea4d551ae517ac3a9b2dbdbd381ed3" + PKG_VERSION="6.1.58" + PKG_SHA256="ce987ed3d2f640b3a2a62a0a8573d538a36dfd3cc31e2d7a239ce5a16c1c21ad" PKG_URL="https://www.kernel.org/pub/linux/kernel/v${PKG_VERSION/.*/}.x/${PKG_NAME}-${PKG_VERSION}.tar.xz" PKG_PATCH_DIRS="default" ;; diff --git a/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch b/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch new file mode 100644 index 0000000000..3349cf4e1a --- /dev/null +++ b/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch @@ -0,0 +1,57 @@ +From 5e4400b24fc1f8ad41bccb6a6bdb54b961526556 Mon Sep 17 00:00:00 2001 +From: Matthias Reichl +Date: Thu, 7 Sep 2023 20:33:25 +0200 +Subject: [PATCH] ASoC: hdmi-codec: Fix broken channel map reporting + +Commit 4e0871333661 ("ASoC: hdmi-codec: fix channel info for +compressed formats") 
accidentally changed hcp->chmap_idx from +ca_id, the CEA channel allocation ID, to idx, the index to +the table of channel mappings ordered by preference. + +This resulted in wrong channel maps being reported to userspace, +eg for 5.1 "FL,FR,LFE,FC" was reported instead of the expected +"FL,FR,LFE,FC,RL,RR": + +~ # speaker-test -c 6 -t sine +... + 0 - Front Left + 3 - Front Center + 1 - Front Right + 2 - LFE + 4 - Unknown + 5 - Unknown + +~ # amixer cget iface=PCM,name='Playback Channel Map' | grep ': values' + : values=3,4,8,7,0,0,0,0 + +Switch this back to ca_id in case of PCM audio so the correct channel +map is reported again and set it to HDMI_CODEC_CHMAP_IDX_UNKNOWN in +case of non-PCM audio so the PCM channel map control returns "Unknown" +channels (value 0). + +Fixes: 4e0871333661 ("ASoC: hdmi-codec: fix channel info for compressed formats") +Cc: stable@vger.kernel.org +Signed-off-by: Matthias Reichl +--- + sound/soc/codecs/hdmi-codec.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c +index 13689e718d36f..09eef6042aad6 100644 +--- a/sound/soc/codecs/hdmi-codec.c ++++ b/sound/soc/codecs/hdmi-codec.c +@@ -531,7 +531,10 @@ static int hdmi_codec_fill_codec_params(struct snd_soc_dai *dai, + hp->sample_rate = sample_rate; + hp->channels = channels; + +- hcp->chmap_idx = idx; ++ if (pcm_audio) ++ hcp->chmap_idx = ca_id; ++ else ++ hcp->chmap_idx = HDMI_CODEC_CHMAP_IDX_UNKNOWN; + + return 0; + } +-- +2.39.2 + diff --git a/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch b/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch deleted file mode 100644 index 677de3ed7f..0000000000 --- a/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch +++ /dev/null @@ -1,21 +0,0 @@ -diff -Naur linux-3.16.1/drivers/media/rc/imon.c linux-3.16.1.patch/drivers/media/rc/imon.c ---- linux-3.16.1/drivers/media/rc/imon.c 
2014-08-14 04:36:35.000000000 +0200 -+++ linux-3.16.1.patch/drivers/media/rc/imon.c 2014-08-15 13:57:16.587620642 +0200 -@@ -1344,6 +1344,17 @@ - } - } else { - /* -+ * For users without stabilized, just ignore any value getting -+ * to close to the diagonal. -+ */ -+ if ((abs(rel_y) < 2 && abs(rel_x) < 2) || -+ abs(abs(rel_y) - abs(rel_x)) < 2 ) { -+ spin_lock_irqsave(&ictx->kc_lock, flags); -+ ictx->kc = KEY_UNKNOWN; -+ spin_unlock_irqrestore(&ictx->kc_lock, flags); -+ return; -+ } -+ /* - * Hack alert: instead of using keycodes, we have - * to use hard-coded scancodes here... - */ diff --git a/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch b/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch deleted file mode 100644 index 16ac49bee6..0000000000 --- a/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch +++ /dev/null @@ -1,25 +0,0 @@ -From c314d9af9d774c052bea324e1a140ccdba0ca070 Mon Sep 17 00:00:00 2001 -From: Stefan Saraev -Date: Tue, 8 Apr 2014 14:02:53 +0300 -Subject: [PATCH] pm: disable async suspend/resume by default - ---- - kernel/power/main.c | 2 +- - 1 files changed, 1 insertions(+), 1 deletions(-) - -diff --git a/kernel/power/main.c b/kernel/power/main.c -index 1d1bf63..361db93 100644 ---- a/kernel/power/main.c -+++ b/kernel/power/main.c -@@ -46,7 +46,7 @@ int pm_notifier_call_chain(unsigned long val) - } - - /* If set, devices may be suspended and resumed asynchronously. 
*/ --int pm_async_enabled = 1; -+int pm_async_enabled = 0; - - static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) --- -1.7.2.5 - diff --git a/packages/mediacenter/LibreELEC-settings/package.mk b/packages/mediacenter/LibreELEC-settings/package.mk index c17a1f41d5..3e8f0e16d8 100644 --- a/packages/mediacenter/LibreELEC-settings/package.mk +++ b/packages/mediacenter/LibreELEC-settings/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="LibreELEC-settings" -PKG_VERSION="9a334c0857fe5ccf84af272f42dc6f6cd5c72e4b" -PKG_SHA256="71be076033ae4bcb9012a12c2fc47b0805b0e40db2e812e19613643bbcba978c" +PKG_VERSION="b920d5d83a8a7445d121d2f920169444111bf93c" +PKG_SHA256="d8147068b6172250d98d41fafd7d6dbaa286074932b537214bf0dab95fe9e99a" PKG_LICENSE="GPL" PKG_SITE="https://libreelec.tv" PKG_URL="https://github.com/LibreELEC/service.libreelec.settings/archive/${PKG_VERSION}.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk index f0858dcf47..0d3068821e 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="inputstream.adaptive" -PKG_VERSION="20.3.11-Nexus" -PKG_SHA256="ed266d2a51efcd0952cfacc8549350282dce07f7c0e885eeb41d662f123e12a6" +PKG_VERSION="20.3.14-Nexus" +PKG_SHA256="59573a0d97bb665e0ada35b44f77e9bf9232adc669d0d44beccf727145d36aff" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk index 4ad6338385..a9c24da1b1 100644 --- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="peripheral.joystick" -PKG_VERSION="20.1.10-Nexus" -PKG_SHA256="b72277358df77ed79a0e7f3ae7e9799d02692fb30408cf6e5325ce7e5a34f597" +PKG_VERSION="20.1.13-Nexus" +PKG_SHA256="9fabd0cbb54f6f4acfa16a5fa2c13e37d121bb0774e0b62ad3604a72c99b95e5" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk index 226a0bc8d8..ad59e7fd5b 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.hts" -PKG_VERSION="20.6.2-Nexus" -PKG_SHA256="e77bd87f1f4d1abc06e32d0347a0bb635bc129bec43a07864cb8a9e6b0e4d374" +PKG_VERSION="20.6.5-Nexus" +PKG_SHA256="bd58fc85543447f918ee567192c87a3beb3f6e2c3fc116abe1f584514d202ada" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk index 2a65a47d54..8f22e1c472 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.iptvsimple" -PKG_VERSION="20.11.0-Nexus" -PKG_SHA256="c2014b11dd928a1d4789279d7f3ce25af8af4047194e8406f9dfd99d16fe2ee2" +PKG_VERSION="20.13.0-Nexus" +PKG_SHA256="9edf800d7d5e755c92e9e8f6d3771a74cf3fec23b3aaec5b8535f1a579941a5b" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk index 46eaf927f6..bb90b2652d 100644 --- 
a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk @@ -3,9 +3,9 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.mythtv" -PKG_VERSION="20.3.2-Nexus" -PKG_SHA256="b1ad428bec882d3e852240cbef2378803635b530545a08421ff3baf0611a29e7" -PKG_REV="2" +PKG_VERSION="20.5.6-Nexus" +PKG_SHA256="321559f9f46a2588bdcfe9be6d6e7439911e548a92e7820dedde6cabccbe72fd" +PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/janbar/pvr.mythtv" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk index 299412a646..37606931c9 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.nextpvr" -PKG_VERSION="20.4.2-Nexus" -PKG_SHA256="1ce85447426ddf6d443a5e3444145a2d3af65ce73d9fb583e42cd8afc9d599a5" +PKG_VERSION="20.4.3-Nexus" +PKG_SHA256="752dff532a277797f3fefc1ced7fea6efb8d92982d9040c4080c1e6dbab203a0" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk index 95b5ecfe32..5b99456089 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk @@ -2,9 +2,9 @@ # Copyright (C) 2021-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.plutotv" -PKG_VERSION="20.3.0-Nexus" -PKG_SHA256="d38a6bf4debc442849d01faedadcccb1b07debe850cd3c9a5789508233d22256" -PKG_REV="5" +PKG_VERSION="20.3.1a-Nexus" +PKG_SHA256="11505556200029a48a293e97f94e0469a5f78580d0e56d5d0a1da05d61f0f5b5" +PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/kodi-pvr/pvr.plutotv" diff --git 
a/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk index 5a9aeb7b8f..42a2278b09 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.waipu" -PKG_VERSION="20.9.0-Nexus" -PKG_SHA256="3a0a1dffe1bb8711dd6747b02a51aee6a0bce40ca6822cc2ab7f04b4b5acb82f" +PKG_VERSION="20.10.2-Nexus" +PKG_SHA256="2326c3ed0e57ef8020c1041ea6f25fa325c619588c24c71b5963d8dda1c97604" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules b/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules new file mode 100644 index 0000000000..775d814c1f --- /dev/null +++ b/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules @@ -0,0 +1,15 @@ +# Ignore power button input devices in libinput so logind can handle them +ACTION=="remove", GOTO="end" +SUBSYSTEM!="input", GOTO="end" +KERNEL!="event*", GOTO="end" + +IMPORT{parent}="KEY" + +# match devices that only generate KEY_POWER (code 116) events +ENV{KEY}=="10000000000000 0", ENV{LIBINPUT_IGNORE_DEVICE}="1" + +# 32bit systems report the bitmap in 32bit chunks +ENV{KEY}=="100000 0 0 0", ENV{LIBINPUT_IGNORE_DEVICE}="1" + +LABEL="end" + diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index 23236911d8..b94b348c21 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="kodi" -PKG_VERSION="20.2-Nexus" -PKG_SHA256="4e81abf81172812bc8891f69a7a80a2b846298cecaae7b5009725e28a3040c23" +PKG_VERSION="618d1e35d89f1c49c2a37f5d233319f3f3bbe01b" +PKG_SHA256="2831ca6c004dde11105c14337b58e89b24cd735596510f1ab85d15f6234a835c" PKG_LICENSE="GPL" 
PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/xbmc/xbmc/archive/${PKG_VERSION}.tar.gz" @@ -408,6 +408,12 @@ post_makeinstall_target() { mkdir -p ${INSTALL}/usr/cache/libreelec cp ${PKG_DIR}/config/network_wait ${INSTALL}/usr/cache/libreelec + # GBM: install udev rule to ignore the power button in libinput/kodi so logind can handle it + if [ "${DISPLAYSERVER}" = "no" ]; then + mkdir -p ${INSTALL}/usr/lib/udev/rules.d/ + cp ${PKG_DIR}/config/70-libinput-ignore-power-button.rules ${INSTALL}/usr/lib/udev/rules.d/ + fi + # update addon manifest ADDON_MANIFEST=${INSTALL}/usr/share/kodi/system/addon-manifest.xml xmlstarlet ed -L -d "/addons/addon[text()='service.xbmc.versioncheck']" ${ADDON_MANIFEST} diff --git a/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch b/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch new file mode 100644 index 0000000000..fe611fb14f --- /dev/null +++ b/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch @@ -0,0 +1,26 @@ +From 5604be6a6701e0bd68cb36fadb05cecba57f7887 Mon Sep 17 00:00:00 2001 +From: Matthias Reichl +Date: Fri, 22 Sep 2023 23:41:51 +0200 +Subject: [PATCH] keymaps: change remote poweroff action to show shutdown menu + +Signed-off-by: Matthias Reichl +--- + system/keymaps/remote.xml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/system/keymaps/remote.xml b/system/keymaps/remote.xml +index c122b99188..baebf679c0 100644 +--- a/system/keymaps/remote.xml ++++ b/system/keymaps/remote.xml +@@ -50,7 +50,7 @@ + VolumeUp + VolumeDown + Mute +- ShutDown() ++ ActivateWindow(ShutdownMenu) + ActivateWindow(Videos) + ActivateWindow(Music) + ActivateWindow(Pictures) +-- +2.39.2 + diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 80a7390f88..3c155232c5 100644 --- a/packages/multimedia/ffmpeg/package.mk 
+++ b/packages/multimedia/ffmpeg/package.mk @@ -64,6 +64,12 @@ if [ "${V4L2_SUPPORT}" = "yes" -a ! "${DEVICE}" = "Switch" ]; then if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" -o "${DEVICE}" = "RPi4" ]; then PKG_V4L2_REQUEST="yes" + elif [ "${PROJECT}" = "RPi" ] && [ "${DEVICE}" = "RPi4" -o "${DEVICE}" = "RPi5" ]; then + PKG_V4L2_REQUEST="yes" + PKG_FFMPEG_HWACCEL="--disable-hwaccel=h264_v4l2request \ + --disable-hwaccel=mpeg2_v4l2request \ + --disable-hwaccel=vp8_v4l2request \ + --disable-hwaccel=vp9_v4l2request" else PKG_V4L2_REQUEST="no" fi diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 27c1326476..cd84890c43 100755 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -1,53 +1,70710 @@ -From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 26 Apr 2021 12:34:50 +0100 -Subject: [PATCH 001/136] Add pi configs and scripts - ---- - pi-util/BUILD.txt | 59 ++++++++ - pi-util/NOTES.txt | 69 +++++++++ - pi-util/TESTMESA.txt | 82 +++++++++++ - pi-util/clean_usr_libs.sh | 26 ++++ - pi-util/conf_arm64_native.sh | 45 ++++++ - pi-util/conf_h265.2016.csv | 195 ++++++++++++++++++++++++++ - pi-util/conf_h265.2016_HEVC_v1.csv | 147 ++++++++++++++++++++ - pi-util/conf_h265.csv | 144 +++++++++++++++++++ - pi-util/conf_native.sh | 108 +++++++++++++++ - pi-util/ffconf.py | 215 +++++++++++++++++++++++++++++ - pi-util/ffperf.py | 128 +++++++++++++++++ - pi-util/genpatch.sh | 35 +++++ - pi-util/make_array.py | 23 +++ - pi-util/mkinst.sh | 5 + - pi-util/patkodi.sh | 9 ++ - pi-util/perfcmp.py | 101 ++++++++++++++ - pi-util/qem.sh | 9 ++ - pi-util/v3dusage.py | 128 +++++++++++++++++ - 18 files changed, 1528 insertions(+) - create mode 100644 pi-util/BUILD.txt - create mode 100644 pi-util/NOTES.txt - create mode 100644 
pi-util/TESTMESA.txt - create mode 100755 pi-util/clean_usr_libs.sh - create mode 100644 pi-util/conf_arm64_native.sh - create mode 100644 pi-util/conf_h265.2016.csv - create mode 100644 pi-util/conf_h265.2016_HEVC_v1.csv - create mode 100644 pi-util/conf_h265.csv - create mode 100755 pi-util/conf_native.sh - create mode 100755 pi-util/ffconf.py - create mode 100755 pi-util/ffperf.py - create mode 100755 pi-util/genpatch.sh - create mode 100755 pi-util/make_array.py - create mode 100755 pi-util/mkinst.sh - create mode 100644 pi-util/patkodi.sh - create mode 100755 pi-util/perfcmp.py - create mode 100755 pi-util/qem.sh - create mode 100755 pi-util/v3dusage.py - +diff --git a/configure b/configure +index 4ba72bf84b..f2fc33e89b 100755 +--- a/configure ++++ b/configure +@@ -207,6 +207,7 @@ External library support: + --disable-bzlib disable bzlib [autodetect] + --disable-coreimage disable Apple CoreImage framework [autodetect] + --enable-chromaprint enable audio fingerprinting with chromaprint [no] ++ --disable-epoxy disable epoxy [autodetect] + --enable-frei0r enable frei0r video filtering [no] + --enable-gcrypt enable gcrypt, needed for rtmp(t)e support + if openssl, librtmp or gmp is not used [no] +@@ -279,6 +280,7 @@ External library support: + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] + --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] ++ --disable-libudev disable libudev [autodetect] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -340,12 +342,17 @@ External library support: + --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] ++ --enable-rpi enable other rpi specific 
stuff [no] ++ --enable-sand enable sand video formats [rpi] ++ --enable-vout-drm enable the vout_drm module - for internal testing only [no] ++ --enable-vout-egl enable the vout_egl module - for internal testing only [no] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] + --enable-rkmpp enable Rockchip Media Process Platform code [no] + --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] ++ --enable-v4l2-request enable V4L2 request API code [no] + --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] + --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] + --disable-videotoolbox disable VideoToolbox code [autodetect] +@@ -1703,7 +1710,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" + avfoundation + bzlib + coreimage ++ epoxy + iconv ++ libudev + libxcb + libxcb_shm + libxcb_shape +@@ -1868,7 +1877,10 @@ HWACCEL_LIBRARY_LIST=" + mmal + omx + opencl ++ v4l2_request + vulkan ++ rpi4_8 ++ rpi4_10 + " + + DOCUMENT_LIST=" +@@ -1884,12 +1896,17 @@ FEATURE_LIST=" + gray + hardcoded_tables + omx_rpi ++ rpi + runtime_cpudetect + safe_bitstream_reader ++ sand + shared + small + static + swscale_alpha ++ vout_drm ++ vout_egl ++ v4l2_req_hevc_vx + " + + # this list should be kept in linking order +@@ -1930,6 +1947,7 @@ SUBSYSTEM_LIST=" + pixelutils + network + rdft ++ rpi + " + + # COMPONENT_LIST needs to come last to ensure correct dependency checking +@@ -2416,9 +2434,11 @@ CONFIG_EXTRA=" + rangecoder + riffdec + riffenc ++ rpi + rtpdec + rtpenc_chain + rv34dsp ++ sand + scene_sad + sinewin + snappy +@@ -2750,6 +2770,8 @@ hap_decoder_select="snappy texturedsp" + hap_encoder_deps="libsnappy" + hap_encoder_select="texturedspenc" + hevc_decoder_select="atsc_a53 bswapdsp cabac golomb hevcparse videodsp" 
++hevc_rpi_decoder_deps="rpi" ++hevc_rpi_decoder_select="hevc_decoder sand" + huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" + huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" + hymt_decoder_select="huffyuv_decoder" +@@ -2920,6 +2942,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" + ffnvcodec_deps_any="libdl LoadLibrary" + nvdec_deps="ffnvcodec" ++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" + vaapi_x11_deps="xlib" + videotoolbox_hwaccel_deps="videotoolbox pthreads" + videotoolbox_hwaccel_extralibs="-framework QuartzCore" +@@ -2961,6 +2984,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" ++hevc_v4l2request_hwaccel_deps="v4l2_request" ++hevc_v4l2request_hwaccel_select="hevc_decoder" ++hevc_rpi4_10_hwaccel_deps="rpi" ++hevc_rpi4_10_hwaccel_select="hevc_decoder" ++hevc_rpi4_8_hwaccel_deps="rpi" ++hevc_rpi4_8_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" + hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" +@@ -3438,8 +3467,13 @@ sndio_indev_deps="sndio" + sndio_outdev_deps="sndio" + v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_indev_suggest="libv4l2" ++v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" ++vout_drm_outdev_deps="libdrm" ++vout_egl_outdev_deps="xlib epoxy" ++vout_rpi_outdev_deps="rpi" ++vout_rpi_outdev_select="sand" + vfwcap_indev_deps="vfw32 vfwcap_defines" + xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +@@ -3658,6 +3692,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" + tonemap_opencl_filter_deps="opencl const_nan" + 
transpose_opencl_filter_deps="opencl" + transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" ++unsand_filter_select="sand" + unsharp_opencl_filter_deps="opencl" + uspp_filter_deps="gpl avcodec" + vaguedenoiser_filter_deps="gpl" +@@ -6155,6 +6190,12 @@ check_func_headers glob.h glob + enabled xlib && + check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext + ++enabled libudev && ++ check_pkg_config libudev libudev libudev.h udev_new ++ ++enabled epoxy && ++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version ++ + check_headers direct.h + check_headers dirent.h + check_headers dxgidebug.h +@@ -6492,11 +6533,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt + check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || + die "ERROR: mbedTLS not found"; } + enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } +-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || ++( enabled rpi || ++ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || + { ! 
enabled cross_compile && + add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && + add_ldflags -L/opt/vc/lib/ && +- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || ++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || + die "ERROR: mmal not found" && + check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } + enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do +@@ -6537,8 +6579,16 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r + { enabled libdrm || + die "ERROR: rkmpp requires --enable-libdrm"; } + } ++enabled v4l2_request && { enabled libdrm || ++ die "ERROR: v4l2-request requires --enable-libdrm"; } && ++ { enabled libudev || ++ die "ERROR: v4l2-request requires libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init + ++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } ++ ++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && ++ { enabled xlib || die "ERROR: vout_egl requires xlib"; } + + if enabled gcrypt; then + GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" +@@ -6618,6 +6668,10 @@ if enabled v4l2_m2m; then + check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" + fi + ++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" ++disable v4l2_req_hevc_vx ++ + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +@@ -7105,6 +7159,9 @@ check_deps $CONFIG_LIST \ + 
enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86" + enabled avresample && warn "Building with deprecated library libavresample" + ++# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done ++enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx ++ + case $target_os in + haiku) + disable memalign +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index dec012a299..8aa13007f9 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -2189,8 +2189,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) + ifilter->channel_layout != frame->channel_layout; + break; + case AVMEDIA_TYPE_VIDEO: +- need_reinit |= ifilter->width != frame->width || +- ifilter->height != frame->height; ++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || ++ ifilter->height != av_frame_cropped_height(frame); + break; + } + +@@ -2201,6 +2201,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) + (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) + need_reinit = 1; + ++ if (no_cvt_hw && fg->graph) ++ need_reinit = 0; ++ + if (need_reinit) { + ret = ifilter_parameters_from_frame(ifilter, frame); + if (ret < 0) +@@ -2469,8 +2472,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + decoded_frame->top_field_first = ist->top_field_first; + + ist->frames_decoded++; +- +- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { ++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); + if (err < 0) + goto fail; +@@ -2674,7 +2676,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo + case AVMEDIA_TYPE_VIDEO: + ret 
= decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt, + &decode_failed); +- if (!repeating || !pkt || got_output) { ++ // Pi: Do not inc dts if no_cvt_hw set ++ // V4L2 H264 decode has long latency and sometimes spits out a long ++ // stream of output without input. In this case incrementing DTS is wrong. ++ // There may be cases where the condition as written is correct so only ++ // "fix" in the cases which cause problems ++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { + if (pkt && pkt->duration) { + duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); + } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { +@@ -2898,6 +2905,16 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat + } else { + const HWAccel *hwaccel = NULL; + int i; ++ ++ if (no_cvt_hw) { ++ config = avcodec_get_hw_config(s->codec, 0); ++ if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) { ++ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p); ++ ist->hwaccel_pix_fmt = *p; ++ break; ++ } ++ } ++ + for (i = 0; hwaccels[i].name; i++) { + if (hwaccels[i].pix_fmt == *p) { + hwaccel = &hwaccels[i]; +@@ -2993,6 +3010,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) + return ret; + } + ++#if CONFIG_HEVC_RPI_DECODER ++ ret = -1; ++ if (strcmp(codec->name, "hevc_rpi") == 0 && ++ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { ++ ist->dec = codec = avcodec_find_decoder_by_name("hevc"); ++ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); ++ } ++ if (ret < 0) ++#endif + if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { + if (ret == AVERROR_EXPERIMENTAL) + abort_codec_experimental(codec, 0); +diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h +index 606f2afe0c..448cd2e009 100644 +--- a/fftools/ffmpeg.h ++++ b/fftools/ffmpeg.h +@@ -61,6 +61,7 @@ 
enum HWAccelID { + HWACCEL_GENERIC, + HWACCEL_VIDEOTOOLBOX, + HWACCEL_QSV, ++ HWACCEL_RPI, + }; + + typedef struct HWAccel { +@@ -611,6 +612,7 @@ extern int video_sync_method; + extern float frame_drop_threshold; + extern int do_benchmark; + extern int do_benchmark_all; ++extern int no_cvt_hw; + extern int do_deinterlace; + extern int do_hex_dump; + extern int do_pkt_dump; +diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c +index 4ab769c07b..5cdc3a7b6c 100644 +--- a/fftools/ffmpeg_filter.c ++++ b/fftools/ffmpeg_filter.c +@@ -1160,8 +1160,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) + + ifilter->format = frame->format; + +- ifilter->width = frame->width; +- ifilter->height = frame->height; ++ ifilter->width = av_frame_cropped_width(frame); ++ ifilter->height = av_frame_cropped_height(frame); + ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; + + ifilter->sample_rate = frame->sample_rate; +diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c +index fc4a5d31d6..cc69dce40e 100644 +--- a/fftools/ffmpeg_hw.c ++++ b/fftools/ffmpeg_hw.c +@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) + char *name; + size_t index_pos; + int index, index_limit = 1000; ++ if (!type_name) ++ return NULL; + index_pos = strlen(type_name); + name = av_malloc(index_pos + 4); + if (!name) +diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c +index 807e783422..456d4f349b 100644 +--- a/fftools/ffmpeg_opt.c ++++ b/fftools/ffmpeg_opt.c +@@ -133,12 +133,22 @@ static const char *const opt_name_enc_time_bases[] = {"enc_time_base" + }\ + } + ++#if CONFIG_RPI ++static int rpi_init(AVCodecContext *avctx) { ++ return 0; ++} ++#endif ++ + const HWAccel hwaccels[] = { + #if CONFIG_VIDEOTOOLBOX + { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, + #endif + #if CONFIG_LIBMFX + { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV }, ++#endif ++#if CONFIG_RPI ++ { "rpi", rpi_init, 
HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, ++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, + #endif + { 0 }, + }; +@@ -158,6 +168,7 @@ float frame_drop_threshold = 0; + int do_deinterlace = 0; + int do_benchmark = 0; + int do_benchmark_all = 0; ++int no_cvt_hw = 0; + int do_hex_dump = 0; + int do_pkt_dump = 0; + int copy_ts = 0; +@@ -3499,6 +3510,8 @@ const OptionDef options[] = { + "add timings for benchmarking" }, + { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, + "add timings for each task" }, ++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, ++ "do not auto-convert hw frames to sw" }, + { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, + "write program-readable progress information", "url" }, + { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 33a280cf69..e93c842047 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ + mediacodec.h \ + packet.h \ + qsv.h \ ++ rpi_zc.h \ + vaapi.h \ + vdpau.h \ + version.h \ +@@ -140,6 +141,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o + OBJS-$(CONFIG_QSVENC) += qsvenc.o + OBJS-$(CONFIG_RANGECODER) += rangecoder.o + OBJS-$(CONFIG_RDFT) += rdft.o ++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o + OBJS-$(CONFIG_RV34DSP) += rv34dsp.o + OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o + OBJS-$(CONFIG_SINEWIN) += sinewin.o +@@ -154,7 +156,10 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o + OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o +-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ ++ weak_link.o v4l2_req_dmabufs.o ++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ ++ v4l2_req_devscan.o weak_link.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o + OBJS-$(CONFIG_WMV2DSP) += 
wmv2dsp.o + +@@ -403,6 +408,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) += qsvdec.o + OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \ + hevc_data.o + OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o ++OBJS-$(CONFIG_RPI) += rpi_mem.o \ ++ rpi_mailbox.o rpi_zc.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ ++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ ++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ ++ rpi_hevc_shader.o rpi_hevc_shader_template.o \ ++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ ++ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o + OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o + OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o +@@ -941,6 +954,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o ++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o ++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o ++OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +@@ -1297,3 +1314,31 @@ $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h + $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h + $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h + endif ++ ++ifdef CONFIG_HEVC_RPI_DECODER ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std ++ ++ifneq ("$(wildcard $(QASM_PY))","") ++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ ++ 
++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
++	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++endif
++
++ifneq ("$(wildcard $(VASMVIDCORE))","")
++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
++	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
++	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
++
++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
++	python pi-util/make_array.py $<
++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
++	python pi-util/make_array.py $<
++endif
++
++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_hevc_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h # target names must match the objects listed in OBJS
++endif
+diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
+index 954461f81d..c8935f205e 100644
+--- a/libavcodec/aarch64/Makefile
++++ b/libavcodec/aarch64/Makefile
+@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
+ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
+                                 aarch64/hpeldsp_neon.o
+ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
+-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
++                                aarch64/simple_idct_neon.o
+ NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
+ NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
+ NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
+ 
+ # decoders/encoders
+diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
+index 742a3372e3..eec21aa5a2 100644
+--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -27,19 +27,29 @@
+ #include "libavcodec/idctdsp.h"
+ #include
"idct.h" + ++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++ + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) + { + int cpu_flags = av_get_cpu_flags(); + +- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { +- if (avctx->idct_algo == FF_IDCT_AUTO || +- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +- avctx->idct_algo == FF_IDCT_SIMPLENEON) { +- c->idct_put = ff_simple_idct_put_neon; +- c->idct_add = ff_simple_idct_add_neon; +- c->idct = ff_simple_idct_neon; +- c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ if (have_neon(cpu_flags)) { ++ if (!avctx->lowres && !high_bit_depth) { ++ if (avctx->idct_algo == FF_IDCT_AUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLENEON) { ++ c->idct_put = ff_simple_idct_put_neon; ++ c->idct_add = ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ } + } ++ ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } + } +diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S +new file mode 100644 +index 0000000000..7f47611206 +--- /dev/null ++++ b/libavcodec/aarch64/idctdsp_neon.S +@@ -0,0 +1,130 @@ ++/* ++ * IDCT AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// Clamp 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit results
++// x2 = row stride for results, bytes
++function ff_put_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // first 32 coefficients (rows 0-3), x0 += 64 bytes
++        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] // remaining 32 coefficients (rows 4-7)
++        sqxtun          v0.8b, v0.8h // saturating narrow: signed 16-bit -> unsigned 8-bit
++        sqxtun          v1.8b, v1.8h
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        sqxtun          v4.8b, v4.8h
++        st1             {v0.8b}, [x1], x2 // store 8 bytes, then advance x1 by the stride
++        sqxtun          v0.8b, v5.8h // v0 has been stored, so it is reused for the next row
++        st1             {v1.8b}, [x1], x2
++        sqxtun          v1.8b, v6.8h
++        st1             {v2.8b}, [x1], x2
++        sqxtun          v2.8b, v7.8h
++        st1             {v3.8b}, [x1], x2
++        st1             {v4.8b}, [x1], x2
++        st1             {v0.8b}, [x1], x2
++        st1             {v1.8b}, [x1], x2
++        st1             {v2.8b}, [x1] // last row: no post-increment
++        ret
++endfunc
++
++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit results
++// x2 = row stride for results, bytes
++function ff_put_signed_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // first 32 coefficients, x0 += 64 bytes
++        movi            v4.8b, #128 // bias added to every narrowed byte below
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] // remaining 32 coefficients
++        sqxtn           v0.8b, v0.8h // saturating narrow: signed 16-bit -> signed 8-bit
++        sqxtn           v1.8b, v1.8h
++        sqxtn           v2.8b, v2.8h
++        sqxtn           v3.8b, v3.8h
++        sqxtn           v5.8b, v16.8h
++        add             v0.8b, v0.8b, v4.8b // + 128 (wrapping byte add) shifts [-128,127] to [0,255]
++        sqxtn           v6.8b, v17.8h
++        add             v1.8b, v1.8b, v4.8b
++        sqxtn           v7.8b, v18.8h
++        add             v2.8b, v2.8b, v4.8b
++        sqxtn           v16.8b, v19.8h
++        add             v3.8b, v3.8b, v4.8b
++        st1             {v0.8b}, [x1], x2 // store 8 bytes, then advance x1 by the stride
++        add             v0.8b, v5.8b, v4.8b // v0..v3 are reused for the second half once stored
++        st1             {v1.8b}, [x1], x2
++        add             v1.8b, v6.8b, v4.8b
++        st1             {v2.8b}, [x1], x2
++        add             v2.8b, v7.8b, v4.8b
++        st1             {v3.8b}, [x1], x2
++        add             v3.8b, v16.8b, v4.8b
++        st1             {v0.8b}, [x1], x2
++        st1             {v1.8b}, [x1], x2
++        st1             {v2.8b}, [x1], x2
++        st1             {v3.8b}, [x1] // last row: no post-increment
++        ret
++endfunc
++
++// Add 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit input and results
++// x2 = row stride for 8-bit input and results, bytes
++function ff_add_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // first 32 coefficients, x0 += 64 bytes
++        mov             x3, x1 // x3 becomes the store pointer; x1 keeps walking ahead as the load pointer
++        ld1             {v4.8b}, [x1], x2 // load 8 existing pixels, advance by the stride
++        ld1             {v5.8b}, [x1], x2
++        ld1             {v6.8b}, [x1], x2
++        ld1             {v7.8b}, [x1], x2
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] // remaining 32 coefficients
++        uaddw           v0.8h, v0.8h, v4.8b // coefficient + zero-extended pixel, 16-bit lanes
++        uaddw           v1.8h, v1.8h, v5.8b
++        uaddw           v2.8h, v2.8h, v6.8b
++        ld1             {v4.8b}, [x1], x2 // v4..v7 are reloaded with the next four pixel rows
++        uaddw           v3.8h, v3.8h, v7.8b
++        ld1             {v5.8b}, [x1], x2
++        sqxtun          v0.8b, v0.8h // saturating narrow of the sum to unsigned 8-bit
++        ld1             {v6.8b}, [x1], x2
++        sqxtun          v1.8b, v1.8h
++        ld1             {v7.8b}, [x1]
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        uaddw           v4.8h, v16.8h, v4.8b
++        st1             {v0.8b}, [x3], x2 // write back through x3, advance by the stride
++        uaddw           v0.8h, v17.8h, v5.8b
++        st1             {v1.8b}, [x3], x2
++        uaddw           v1.8h, v18.8h, v6.8b
++        st1             {v2.8b}, [x3], x2
++        uaddw           v2.8h, v19.8h, v7.8b
++        sqxtun          v4.8b, v4.8h
++        sqxtun          v0.8b, v0.8h
++        st1             {v3.8b}, [x3], x2
++        sqxtun          v1.8b, v1.8h
++        sqxtun          v2.8b, v2.8h
++        st1             {v4.8b}, [x3], x2
++        st1             {v0.8b}, [x3], x2
++        st1             {v1.8b}, [x3], x2
++        st1             {v2.8b}, [x3] // last row: no post-increment
++        ret
++endfunc
+diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+index 13dfd74940..a7976fd596 100644
+--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -21,10 +21,28 @@
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ 
+ #include "config.h"
+ 
++void ff_vc1_inv_trans_8x8_neon(int16_t *block);
++void
ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ 
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); /* NOTE(review): defined outside this hunk; its return value is treated below as the bytes it left unprocessed - confirm */
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) /* copies src to dst minus escape bytes; returns the number of bytes written */
++{
++    /* Dealing with starting and stopping, and removing escape bytes, are
++     * comparatively less time-sensitive, so are more clearly expressed using
++     * a C wrapper around the assembly inner loop. Note that we assume a
++     * little-endian machine that supports unaligned loads. */
++    int dsize = 0; /* running count of bytes written to dst */
++    while (size >= 4) /* the escape test below reads 4 bytes at a time */
++    {
++        int found = 0;
++        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise copy until dst is 8-byte aligned */
++        {
++            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by a byte <= 3, given a little-endian 32-bit read */
++            if (!found)
++            {
++                *dst++ = *src++;
++                --size;
++                ++dsize;
++            }
++        }
++        if (!found)
++        {
++            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* bytes consumed by the assembly helper */
++            dst += skip;
++            src += skip;
++            size -= skip;
++            dsize += skip;
++            while (!found && size >= 4) /* byte-wise scan from where the helper stopped */
++            {
++                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same escape test as above */
++                if (!found)
++                {
++                    *dst++ = *src++;
++                    --size;
++                    ++dsize;
++                }
++            }
++        }
++        if (found)
++        {
++            *dst++ = *src++; /* keep the two 0x00 bytes... */
++            *dst++ = *src++;
++            ++src; /* ...and drop the 0x03 escape byte */
++            size -= 3; /* three input bytes consumed, two output bytes produced */
++            dsize += 2;
++        }
++    }
++    while (size > 0) /* fewer than 4 bytes left: copy them verbatim */
++    {
++        *dst++ = *src++;
++        --size;
++        ++dsize;
++    }
++    return dsize;
++}
++
+ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (have_neon(cpu_flags)) {
++        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; /* install the NEON implementations declared above */
++        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
++        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
++        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
++        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
++        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
++        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
++        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
++
++        dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
++        dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
++        dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
++        dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
++        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1]
= ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } + } +diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S +new file mode 100644 +index 0000000000..9a96c2523c +--- /dev/null ++++ b/libavcodec/aarch64/vc1dsp_neon.S +@@ -0,0 +1,1546 @@ ++/* ++ * VC1 AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// VC-1 8x8 inverse transform ++// On entry: ++// x0 -> array of 16-bit inverse transform coefficients, in column-major order ++// On exit: ++// array at x0 updated to hold transformed block; also now held in row-major order ++function ff_vc1_inv_trans_8x8_neon, export=1 ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ ld1 {v3.16b, v4.16b}, [x0], #32 ++ ld1 {v5.16b, v6.16b}, [x0], #32 ++ shl v1.8h, v1.8h, #2 // 8/2 * src[0] ++ sub x1, x0, #3*32 ++ ld1 {v16.16b, v17.16b}, [x0] ++ shl v7.8h, v2.8h, #4 // 16 * src[8] ++ shl v18.8h, v2.8h, #2 // 4 * src[8] ++ shl v19.8h, v4.8h, #4 // 16 * src[24] ++ ldr d0, .Lcoeffs_it8 ++ shl v5.8h, v5.8h, #2 // 8/2 * src[32] ++ shl v20.8h, v6.8h, #4 // 16 * src[40] ++ shl v21.8h, v6.8h, #2 // 4 * src[40] ++ shl v22.8h, v17.8h, #4 // 16 * 
src[56] ++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] ++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] ++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] ++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] ++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] ++ shl v3.8h, v3.8h, #3 // 16/2 * src[16] ++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] ++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] ++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ shl v21.8h, v16.8h, #3 // 16/2 * src[48] ++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ neg v3.8h, v7.8h // -t1 ++ neg v4.8h, v20.8h // +t2 ++ neg v6.8h, v19.8h // +t3 ++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 ++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 ++ neg v7.8h, 
v18.8h // +t4 ++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 ++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 ++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 ++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 ++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 ++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 ++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 ++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 ++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 ++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3 ++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 ++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 ++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 ++ trn2 v17.8h, v3.8h, v4.8h ++ trn2 v18.8h, v5.8h, v6.8h ++ trn2 v19.8h, v2.8h, v1.8h ++ trn2 v20.8h, v7.8h, v16.8h ++ trn1 v21.4s, v17.4s, v18.4s ++ trn2 v17.4s, v17.4s, v18.4s ++ trn1 v18.4s, v19.4s, v20.4s ++ trn2 v19.4s, v19.4s, v20.4s ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.2d, v21.2d, v18.2d ++ trn1 v20.2d, v17.2d, v19.2d ++ trn1 v5.8h, v5.8h, v6.8h ++ trn1 v1.8h, v2.8h, v1.8h ++ trn1 v2.8h, v7.8h, v16.8h ++ trn1 v6.2d, v21.2d, v18.2d ++ trn2 v7.2d, v17.2d, v19.2d ++ shl v16.8h, v20.8h, #4 // 16 * src[24] ++ shl v17.8h, v4.8h, #4 // 16 * src[40] ++ trn1 v18.4s, v3.4s, v5.4s ++ trn1 v19.4s, v1.4s, v2.4s ++ shl v21.8h, v7.8h, #4 // 16 * src[56] ++ shl v22.8h, v6.8h, #2 // 4 * src[8] ++ shl v23.8h, v4.8h, #2 // 4 * src[40] ++ trn2 v3.4s, v3.4s, v5.4s ++ trn2 v1.4s, v1.4s, v2.4s ++ shl v2.8h, v6.8h, #4 // 16 * src[8] ++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] ++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] ++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] ++ trn1 v22.2d, v18.2d, v19.2d ++ trn2 v18.2d, v18.2d, v19.2d ++ trn1 v19.2d, v3.2d, v1.2d ++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] ++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ shl v21.8h, v22.8h, #2 // 8/2 * src[0] ++ shl v18.8h, v18.8h, #2 // 8/2 * src[32] ++ mls v5.8h, 
v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ shl v6.8h, v19.8h, #3 // 16/2 * src[16] ++ trn2 v1.2d, v3.2d, v1.2d ++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] ++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] ++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] ++ shl v19.8h, v1.8h, #3 // 16/2 * src[48] ++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ neg v21.8h, v17.8h // +t2 ++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v4.8h, v5.8h // +t3 ++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v24.8h, v16.8h // +t4 ++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 ++ neg v3.8h, v2.8h // -t1 ++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 ++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 ++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 ++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 ++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 ++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 ++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) 
>> 1 ++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 ++ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7 ++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 ++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 ++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 ++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 ++ st1 {v2.16b, v3.16b}, [x1], #32 ++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 ++ st1 {v4.16b, v5.16b}, [x1], #32 ++ st1 {v16.16b, v17.16b}, [x1], #32 ++ st1 {v0.16b, v1.16b}, [x1] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_neon, export=1 ++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 ++ mov x3, x0 ++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ ld1 {v5.8b}, [x0], x1 ++ trn2 v6.4h, v1.4h, v3.4h ++ trn2 v7.4h, v2.4h, v4.4h ++ trn1 v1.4h, v1.4h, v3.4h ++ trn1 v2.4h, v2.4h, v4.4h ++ trn2 v3.4h, v16.4h, v18.4h ++ trn2 v4.4h, v17.4h, v19.4h ++ trn1 v16.4h, v16.4h, v18.4h ++ trn1 v17.4h, v17.4h, v19.4h ++ ld1 {v18.8b}, [x0], x1 ++ trn1 v19.2s, v6.2s, v3.2s ++ trn2 v3.2s, v6.2s, v3.2s ++ trn1 v6.2s, v7.2s, v4.2s ++ trn2 v4.2s, v7.2s, v4.2s ++ trn1 v7.2s, v1.2s, v16.2s ++ trn1 v20.2s, v2.2s, v17.2s ++ shl v21.4h, v19.4h, #4 // 16 * src[1] ++ trn2 v1.2s, v1.2s, v16.2s ++ shl v16.4h, v3.4h, #4 // 16 * src[3] ++ trn2 v2.2s, v2.2s, v17.2s ++ shl v17.4h, v6.4h, #4 // 16 * src[5] ++ ld1 {v22.8b}, [x0], x1 ++ shl v23.4h, v4.4h, #4 // 16 * src[7] ++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] ++ ld1 {v25.8b}, [x0] ++ shl v26.4h, v19.4h, #2 // 4 * src[1] ++ shl v27.4h, v6.4h, #2 // 4 * src[5] ++ ssra v21.4h, v23.4h, #2 // 16 * 
src[1] + 4 * src[7] ++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] ++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] ++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] ++ shl v7.4h, v7.4h, #2 // 8/2 * src[0] ++ shl v20.4h, v20.4h, #2 // 8/2 * src[4] ++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[2] ++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] ++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] ++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] ++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] ++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] ++ shl v3.4h, v2.4h, #3 // 16/2 * src[6] ++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] ++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] ++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] ++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] ++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] ++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] ++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] ++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] ++ neg v6.4h, v21.4h // -t1 ++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ neg v3.4h, v17.4h // +t2 ++ neg v4.4h, v16.4h // +t3 ++ neg v28.4h, v23.4h // +t4 ++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 ++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 ++ ssra v20.4h, v3.4h, 
#1 // (t6 + t2) >> 1 ++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 ++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 ++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 ++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 ++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 ++ trn1 v1.2d, v7.2d, v1.2d ++ trn1 v2.2d, v20.2d, v2.2d ++ trn1 v3.2d, v24.2d, v27.2d ++ trn1 v4.2d, v19.2d, v26.2d ++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 ++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 ++ trn2 v6.8h, v1.8h, v2.8h ++ trn1 v1.8h, v1.8h, v2.8h ++ trn2 v2.8h, v3.8h, v4.8h ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.4s, v6.4s, v2.4s ++ trn1 v7.4s, v1.4s, v3.4s ++ trn2 v1.4s, v1.4s, v3.4s ++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] ++ trn1 v2.4s, v6.4s, v2.4s ++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] ++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] ++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] ++ neg v2.8h, v3.8h // -t4/2 ++ neg v6.8h, v4.8h // -t3/2 ++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 ++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 ++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 ++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 ++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 ++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v18.8b ++ uaddw v2.8h, v2.8h, v22.8b ++ uaddw v3.8h, v3.8h, v25.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 
{v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_neon, export=1 ++ mov x3, #16 ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 ++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 ++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 ++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 ++ ld1 {v4.d}[1], [x2] // 70 71 72 73 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ ld1 {v7.s}[0], [x0], x1 ++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 ++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 ++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 ++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72 ++ ld1 {v4.s}[0], [x0], x1 ++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 ++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 ++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 ++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] ++ ld1 {v5.s}[1], [x0], x1 ++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] ++ ld1 {v6.s}[1], [x0], x1 ++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 ++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] ++ ld1 {v7.s}[1], [x0], x1 ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] ++ ld1 {v4.s}[1], [x0] ++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * 
src[2] ++ neg v3.8h, v16.8h // -t3/2 ++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 ++ neg v18.8h, v17.8h // -t4/2 ++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 ++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3 ++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3 ++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3 ++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 ++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 ++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 ++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 ++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 ++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 ++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 ++ mov d18, v3.d[1] // 50 51 52 53 ++ shl v19.4h, v3.4h, #4 // 16 * src[8] ++ mov d20, v16.d[1] // 70 71 72 73 ++ shl v21.4h, v16.4h, #4 // 16 * src[24] ++ mov d22, v17.d[1] // 40 41 42 43 ++ shl v23.4h, v3.4h, #2 // 4 * src[8] ++ shl v24.4h, v18.4h, #4 // 16 * src[40] ++ shl v25.4h, v20.4h, #4 // 16 * src[56] ++ shl v26.4h, v18.4h, #2 // 4 * src[40] ++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 ++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] ++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] ++ shl v17.4h, v17.4h, #2 // 8/2 * src[0] ++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] ++ shl v22.4h, v22.4h, #2 // 8/2 * src[32] ++ mov d23, v1.d[1] // 60 61 62 63 ++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] ++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[16] ++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] ++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] ++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 *
src[56] ++ shl v3.4h, v23.4h, #3 // 16/2 * src[48] ++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ neg v23.4h, v24.4h // +t2 ++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ neg v17.4h, v21.4h // +t3 ++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ neg v16.4h, v19.4h // -t1 ++ neg v27.4h, v2.4h // +t4 ++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 ++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 ++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 ++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 ++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 ++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 ++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 ++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 ++ trn1 v0.2d, v20.2d, v0.2d ++ trn1 v2.2d, v18.2d, v22.2d ++ trn1 v3.2d, v25.2d, v3.2d ++ trn1 v1.2d, v26.2d, v1.2d ++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 ++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 ++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 
- t2 + 65) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ uaddw v3.8h, v3.8h, v7.8b ++ uaddw v1.8h, v1.8h, v4.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v2.s}[0], [x4], x1 ++ st1 {v3.s}[0], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v2.s}[1], [x4], x1 ++ st1 {v3.s}[1], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_neon, export=1 ++ mov x3, #16 ++ ldr d0, .Lcoeffs_it4 ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2] // 30 31 32 33 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v5.s}[1], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 ++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 ++ ld1 {v6.s}[1], [x0] ++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 ++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 ++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 ++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 ++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 ++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 ++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] ++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] ++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.4h, v7.4h, v1.4h // 
t2 = 17 * src[0] - 17 * src[2] ++ neg v7.4h, v3.4h // -t3/2 ++ neg v16.4h, v4.4h // -t4/2 ++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 ++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 ++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3 ++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3 ++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3 ++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3 ++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 ++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 ++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 ++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 ++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 ++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 ++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 ++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 ++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] ++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] ++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] ++ neg v3.4h, v2.4h // -t4/2 ++ neg v7.4h, v4.4h // -t3/2 ++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 ++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 ++ trn1 v0.2d, v4.2d, v3.2d ++ trn1 v1.2d, v2.2d, v7.2d ++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v6.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 8x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples,
in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x8_dc_neon, export=1 ++ ldrsh w2, [x2] // dc, sign-extended from 16 bits ++ mov x3, x0 // x3 = write pointer (x0 is advanced by the loads below) ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 // 3 * dc ++ ld1 {v3.8b}, [x0], x1 ++ ld1 {v4.8b}, [x0], x1 ++ add w2, w2, #1 // 3 * dc + 1 ++ ld1 {v5.8b}, [x0], x1 ++ asr w2, w2, #1 // dc' = (3 * dc + 1) >> 1 ++ ld1 {v6.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 // 3 * dc' ++ ld1 {v7.8b}, [x0] ++ add w0, w2, #16 // 3 * dc' + 16 ++ asr w0, w0, #5 // (3 * dc' + 16) >> 5 ++ dup v16.8h, w0 // broadcast the scaled DC value to all 8 lanes ++ uaddw v0.8h, v16.8h, v0.8b // DC + zero-extended samples, one row per register ++ uaddw v1.8h, v16.8h, v1.8b ++ uaddw v2.8h, v16.8h, v2.8b ++ uaddw v3.8h, v16.8h, v3.8b ++ uaddw v4.8h, v16.8h, v4.8b ++ uaddw v5.8h, v16.8h, v5.8b ++ sqxtun v0.8b, v0.8h // saturate back to unsigned 8-bit ++ uaddw v6.8h, v16.8h, v6.8b ++ sqxtun v1.8b, v1.8h ++ uaddw v7.8h, v16.8h, v7.8b ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x3], x1 ++ sqxtun v0.8b, v5.8h // v0-v2 are reused for rows 5-7 once rows 0-2 are stored ++ st1 {v1.8b}, [x3], x1 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x3], x1 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x3], x1 ++ st1 {v4.8b}, [x3], x1 ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_dc_neon, export=1 ++ ldrsh w2, [x2] // dc, sign-extended from 16 bits ++ mov x3, x0 // x3 = write pointer (x0 is advanced by the loads below) ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 // 3 * dc ++ ld1 {v3.8b}, [x0] ++ add w0, w2, #1 // 3 * dc + 1 ++ asr w0, w0, #1 // dc' = (3 * dc + 1) >> 1 ++ add w0, w0, w0, lsl #4 // 17 * dc' ++ add w0, w0, #64 // 17 * dc' + 64 ++ asr w0, w0, #7 // (17 * dc' + 64) >> 7 ++ dup v4.8h, w0 // broadcast the scaled DC value to all 8 lanes ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h,
v3.8b ++ sqxtun v0.8b, v0.8h // saturate back to unsigned 8-bit ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_dc_neon, export=1 ++ ldrsh w2, [x2] // dc, sign-extended from 16 bits ++ mov x3, x0 // x3 = write pointer (x0 is advanced by the loads below) ++ ld1 {v0.s}[0], [x0], x1 // row 0 ++ ld1 {v1.s}[0], [x0], x1 // row 1 ++ ld1 {v2.s}[0], [x0], x1 // row 2 ++ add w2, w2, w2, lsl #4 // 17 * dc ++ ld1 {v3.s}[0], [x0], x1 // row 3 ++ add w2, w2, #4 // 17 * dc + 4 ++ asr w2, w2, #3 // dc' = (17 * dc + 4) >> 3 ++ add w2, w2, w2, lsl #1 // 3 * dc' ++ ld1 {v0.s}[1], [x0], x1 // row 4 ++ add w2, w2, #16 // 3 * dc' + 16 ++ asr w2, w2, #5 // (3 * dc' + 16) >> 5, equal to (12 * dc' + 64) >> 7 ++ dup v4.8h, w2 // broadcast the scaled DC value to all 8 lanes ++ ld1 {v1.s}[1], [x0], x1 // row 5 ++ ld1 {v2.s}[1], [x0], x1 // row 6 ++ ld1 {v3.s}[1], [x0] // row 7 ++ uaddw v0.8h, v4.8h, v0.8b // DC + zero-extended samples, two rows per register ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h // saturate back to unsigned 8-bit ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v2.s}[0], [x3], x1 ++ st1 {v3.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3], x1 ++ st1 {v2.s}[1], [x3], x1 ++ st1 {v3.s}[1], [x3] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_dc_neon, export=1 ++ ldrsh w2, [x2] // dc, sign-extended from 16 bits ++ mov x3, x0 // x3 = write pointer (x0 is advanced by the loads below) ++ ld1 {v0.s}[0], [x0], x1 // row 0 ++ ld1 {v1.s}[0], [x0], x1 // row 1 ++ ld1 {v0.s}[1], [x0], x1 // row 2 ++ add w2, w2, w2, lsl #4 // 17 * dc ++ ld1 {v1.s}[1], [x0] // row 3 ++ add w0, w2, #4 // 17 * dc + 4 ++ asr w0, w0, #3 // dc' = (17 * dc + 4) >> 3 ++ add w0, w0, w0, lsl #4 // 17 * dc' ++ add w0, w0, #64 // 17 * dc' + 64 ++ asr w0, w0, #7 // (17 * dc' + 64) >> 7 ++
dup v2.8h, w0 // broadcast the scaled DC value to all 8 lanes ++ uaddw v0.8h, v2.8h, v0.8b // DC + zero-extended samples, two rows per register ++ uaddw v1.8h, v2.8h, v1.8b ++ sqxtun v0.8b, v0.8h // saturate back to unsigned 8-bit ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3] ++ ret ++endfunc ++ ++.align 5 ++.Lcoeffs_it8: ++.quad 0x000F00090003 // halfwords 3, 9, 15, 0: 8-point transform multipliers (6/2, 9, 15) ++.Lcoeffs_it4: ++.quad 0x0011000B0005 // halfwords 5, 11, 17, 0: 4-point transform multipliers (10/2, 22/2, 17); placed directly after .Lcoeffs_it8 so a q-register load of that label picks these up as well ++.Lcoeffs: ++.quad 0x00050002 // halfwords 2, 5, 0, 0: loop filter multipliers ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub x3, x0, w1, sxtw #2 // x3 = x0 - 4 * stride (row holding P1) ++ ldr d0, .Lcoeffs // v0.h[0] = 2, v0.h[1] = 5 ++ ld1 {v1.s}[0], [x0], x1 // P5 ++ ld1 {v2.s}[0], [x3], x1 // P1 ++ ld1 {v3.s}[0], [x3], x1 // P2 ++ ld1 {v4.s}[0], [x0], x1 // P6 ++ ld1 {v5.s}[0], [x3], x1 // P3 ++ ld1 {v6.s}[0], [x0], x1 // P7 ++ ld1 {v7.s}[0], [x3] // P4 ++ ld1 {v16.s}[0], [x0] // P8 ++ ushll v17.8h, v1.8b, #1 // 2*P5 ++ dup v18.8h, w2 // pq ++ ushll v2.8h, v2.8b, #1 // 2*P1 ++ uxtl v3.8h, v3.8b // P2 ++ uxtl v4.8h, v4.8b // P6 ++ uxtl v19.8h, v5.8b // P3 ++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 ++ uxtl v3.8h, v6.8b // P7 ++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 ++ ushll v5.8h, v5.8b, #1 // 2*P3 ++ uxtl v6.8h, v7.8b // P4 ++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v3.8h, v16.8b // P8 ++ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 ++ uxtl v1.8h, v1.8b // P5 ++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 ++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ sub v3.4h, v6.4h, v1.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ abs v4.4h, v3.4h // |P4-P5| ++ srshr v7.4h, v17.4h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3 ++ srshr v2.4h, v2.4h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3 ++ sshr v4.4h, v4.4h, #1 // clip ++ srshr v5.4h, v5.4h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3 ++ abs v7.4h, v7.4h // a2 ++ sshr v3.4h, v3.4h, #8 // clip_sign ++ abs v2.4h, v2.4h // a1 ++ cmeq v16.4h, v4.4h, #0 // test clip == 0 ++ abs v17.4h,
v5.4h // a0 ++ sshr v5.4h, v5.4h, #8 // a0_sign ++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 ++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq ++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign ++ bsl v19.8b, v7.8b, v2.8b // a3 ++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v4.4h ++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v6.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3] ++ dup v5.8h, w2 // pq ++ trn1 v6.8b, v1.8b, v2.8b ++ trn2 v1.8b, v1.8b, v2.8b ++ trn1 v2.8b, 
v3.8b, v4.8b ++ trn2 v3.8b, v3.8b, v4.8b ++ trn1 v4.4h, v6.4h, v2.4h // P1, P5 ++ trn1 v7.4h, v1.4h, v3.4h // P2, P6 ++ trn2 v2.4h, v6.4h, v2.4h // P3, P7 ++ trn2 v1.4h, v1.4h, v3.4h // P4, P8 ++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 ++ uxtl v6.8h, v7.8b // P2, P6 ++ uxtl v7.8h, v2.8b // P3, P7 ++ uxtl v1.8h, v1.8b // P4, P8 ++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 ++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 ++ uxtl v4.8h, v4.8b // P1, P5 ++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ mov d6, v6.d[1] // P6 ++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ mov d4, v4.d[1] // P5 ++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 ++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ sub v7.4h, v1.4h, v4.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ srshr v3.8h, v3.8h, #3 ++ abs v6.4h, v7.4h ++ sshr v7.4h, v7.4h, #8 // clip_sign ++ srshr v2.4h, v2.4h, #3 ++ abs v3.8h, v3.8h // a1, a2 ++ sshr v6.4h, v6.4h, #1 // clip ++ mov d16, v3.d[1] // a2 ++ abs v17.4h, v2.4h // a0 ++ cmeq v18.4h, v6.4h, #0 // test clip == 0 ++ sshr v2.4h, v2.4h, #8 // a0_sign ++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 ++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq ++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign ++ bsl v19.8b, v16.8b, v3.8b // a3 ++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v6.4h ++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v3.8b, v4.8h ++ sqxtun v2.8b, v1.8h ++ st2 {v2.b, v3.b}[0], [x0], x1 ++ st2 {v2.b, v3.b}[1], [x0], x1 ++ st2 {v2.b, v3.b}[2], [x0], x1 ++ st2 {v2.b, v3.b}[3], [x0] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.8b}, [x3], x1 // P1 ++ ld1 {v4.8b}, [x3], x1 // P2 ++ ld1 {v5.8b}, [x0], x1 // P6 ++ ld1 {v6.8b}, [x3], x1 // P3 ++ ld1 {v7.8b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5 ++ ushll v3.8h, v3.8b, #1 // 2*P1 ++ ld1 {v17.8b}, [x3] // P4 ++ uxtl v4.8h, v4.8b // P2 ++ ld1 {v18.8b}, [x0] // P8 ++ uxtl v5.8h, v5.8b // P6 ++ dup v19.8h, w2 // pq ++ uxtl v20.8h, v6.8b // P3 ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v4.8h, v7.8b // P7 ++ ushll v6.8h, v6.8b, #1 // 2*P3 ++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v7.8h, v17.8b // P4 ++ uxtl v17.8h, v18.8b // P8 ++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v1.8h, v1.8b // P5 ++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v4.8h, v7.8h, v1.8h // P4-P5 ++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 ++ mls v16.8h, v17.8h, v0.h[0] // 
2*P5-5*P6+5*P7-2*P8 ++ abs v17.8h, v4.8h // |P4-P5| ++ sshr v4.8h, v4.8h, #8 // clip_sign ++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v17.8h, v17.8h, #1 // clip ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v16.8h, v16.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v5.8h, v17.8h, #0 // test clip == 0 ++ srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3 ++ abs v16.8h, v16.8h // a2 ++ abs v3.8h, v3.8h // a1 ++ srshr v6.8h, v6.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3 ++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 ++ abs v20.8h, v6.8h // a0 ++ sshr v6.8h, v6.8h, #8 // a0_sign ++ bsl v18.16b, v16.16b, v3.16b // a3 ++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq ++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign ++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 ++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ cmtst v2.2d, v5.2d, v2.2d // if pixel pair 2 (0-based, i.e. the 3rd) of each group of 4 is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ?
(5*(a0-a3))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v2.16b, v3.16b, v2.16b ++ cmhs v3.8h, v0.8h, v17.8h ++ and w0, w0, w2 ++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) ++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered ++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v7.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #2 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3], x1 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3] ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ dup v4.8h, w2 // pq ++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... 
++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn1 v7.2s, v6.2s, v3.2s // P1 ++ trn1 v18.2s, v19.2s, v16.2s // P2 ++ trn2 v3.2s, v6.2s, v3.2s // P5 ++ trn2 v6.2s, v19.2s, v16.2s // P6 ++ trn1 v16.2s, v2.2s, v17.2s // P3 ++ trn2 v2.2s, v2.2s, v17.2s // P7 ++ ushll v7.8h, v7.8b, #1 // 2*P1 ++ trn1 v17.2s, v1.2s, v5.2s // P4 ++ ushll v19.8h, v3.8b, #1 // 2*P5 ++ trn2 v1.2s, v1.2s, v5.2s // P8 ++ uxtl v5.8h, v18.8b // P2 ++ uxtl v6.8h, v6.8b // P6 ++ uxtl v18.8h, v16.8b // P3 ++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v2.8h, v2.8b // P7 ++ ushll v5.8h, v16.8b, #1 // 2*P3 ++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v16.8h, v17.8b // P4 ++ uxtl v1.8h, v1.8b // P8 ++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v2.8h, v3.8b // P5 ++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v3.8h, v16.8h, v2.8h // P4-P5 ++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 ++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v1.8h, v3.8h ++ sshr v3.8h, v3.8h, #8 // clip_sign ++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v1.8h, v1.8h, #1 // clip ++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v17.8h, v19.8h, #3 ++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v6.8h, v1.8h, #0 // test clip == 0 ++ srshr v7.8h, v7.8h, #3 ++ abs v17.8h, v17.8h // a2 ++ abs v7.8h, v7.8h // a1 ++ srshr v5.8h, v5.8h, #3 ++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 ++ abs v19.8h, v5.8h // a0 ++ sshr v5.8h, v5.8h, #8 // a0_sign ++ bsl v18.16b, v17.16b, v7.16b // a3 ++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq ++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign ++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 ++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v1.8h ++ and w5, w2, w3 ++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) ++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v1.8b, v2.8h ++ sqxtun v0.8b, v16.8h ++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[4], [x4], x1 ++ st2 {v0.b, v1.b}[5], [x4], x1 ++ st2 {v0.b, v1.b}[6], [x4], x1 ++ st2 {v0.b, v1.b}[7], [x4] ++2: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.16b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.16b}, [x3], x1 // P1 ++ ld1 {v4.16b}, [x3], 
x1 // P2 ++ ld1 {v5.16b}, [x0], x1 // P6 ++ ld1 {v6.16b}, [x3], x1 // P3 ++ ld1 {v7.16b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] ++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] ++ ld1 {v18.16b}, [x3] // P4 ++ uxtl v19.8h, v4.8b // P2[0..7] ++ ld1 {v20.16b}, [x0] // P8 ++ uxtl v21.8h, v5.8b // P6[0..7] ++ dup v22.8h, w2 // pq ++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] ++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] ++ uxtl2 v4.8h, v4.16b // P2[8..15] ++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ uxtl2 v5.8h, v5.16b // P6[8..15] ++ uxtl v23.8h, v6.8b // P3[0..7] ++ uxtl v24.8h, v7.8b // P7[0..7] ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] ++ uxtl v25.8h, v18.8b // P4[0..7] ++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl2 v26.8h, v6.16b // P3[8..15] ++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl2 v7.8h, v7.16b // P7[8..15] ++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] ++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl2 v18.8h, v18.16b // P4[8..15] ++ uxtl v23.8h, v20.8b // P8[0..7] ++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ uxtl v24.8h, v1.8b // P5[0..7] ++ uxtl2 v20.8h, v20.16b // P8[8..15] ++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl2 v1.8h, v1.16b // P5[8..15] ++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] ++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] ++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v27.8h, v26.8h ++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] ++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ abs v28.8h, v7.8h ++ sshr v27.8h, v27.8h, #1 // clip[0..7] ++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v7.8h, v7.8h, #8 // clip_sign[8..15] ++ sshr v23.8h, 
v28.8h, #1 // clip[8..15] ++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 ++ srshr v17.8h, v17.8h, #3 ++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 ++ srshr v16.8h, v16.8h, #3 ++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ abs v17.8h, v17.8h // a1[0..7] ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ srshr v3.8h, v3.8h, #3 ++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v16.8h, v16.8h // a2[0..7] ++ srshr v19.8h, v19.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] ++ abs v3.8h, v3.8h // a1[8..15] ++ srshr v4.8h, v4.8h, #3 ++ abs v19.8h, v19.8h // a2[8..15] ++ bsl v5.16b, v16.16b, v17.16b // a3[0..7] ++ srshr v6.8h, v6.8h, #3 ++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15] ++ abs v17.8h, v4.8h // a0[0..7] ++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] ++ bsl v16.16b, v19.16b, v3.16b // a3[8..15] ++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ abs v19.8h, v6.8h // a0[8..15] ++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq ++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] ++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] ++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] ++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq ++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] ++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] ++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ cmhs v19.8h, v3.8h, v27.8h ++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v16.16b, v20.16b, v17.16b ++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) ++ cmtst v2.2d, v5.2d, v2.2d ++ cmhs v3.8h, v0.8h, v23.8h ++ mov w4, v5.s[1] ++ mov w5, v5.s[3] ++ and w0, w0, w2 ++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ orr v2.16b, v7.16b, v2.16b ++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) ++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and w2, w4, w5 ++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ and w0, w0, w2 ++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ sqxtun v2.8b, v25.8h ++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case ++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ sqxtun v0.8b, v24.8h ++ sqxtun2 v2.16b, v18.8h ++ sqxtun2 v0.16b, v1.8h ++ st1 {v2.16b}, [x3], x1 ++ st1 {v0.16b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ sub x3, x0, #4 // where to 
start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #3 ++ ld1 {v3.8b}, [x3], x1 ++ add x5, x0, x1, lsl #2 ++ ld1 {v4.8b}, [x3], x1 ++ add x6, x4, x1, lsl #2 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3], x1 ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ ld1 {v2.8b}, [x3], x1 ++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ ld1 {v19.8b}, [x3], x1 ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ ld1 {v4.8b}, [x3], x1 ++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ ld1 {v21.8b}, [x3], x1 ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ ld1 {v6.8b}, [x3], x1 ++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ ld1 {v23.8b}, [x3], x1 ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ ld1 {v17.8b}, [x3], x1 ++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... ++ ld1 {v25.8b}, [x3] ++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... ++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... ++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... ++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... ++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... ++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... ++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... ++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... ++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... 
++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] ++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] ++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] ++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] ++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... ++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... ++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... ++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] ++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] ++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] ++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] ++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... ++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... 
++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] ++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] ++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] ++ uxtl v17.8h, v27.8b // P2[0..7] ++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] ++ uxtl v20.8h, v21.8b // P6[0..7] ++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] ++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] ++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] ++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] ++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] ++ uxtl v26.8h, v26.8b // P2[8..15] ++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ uxtl v17.8h, v18.8b // P6[8..15] ++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] ++ uxtl v28.8h, v7.8b // P3[0..7] ++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ uxtl v16.8h, v16.8b // P7[0..7] ++ uxtl v26.8h, v21.8b // P3[8..15] ++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl v22.8h, v22.8b // P7[8..15] ++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] ++ uxtl v27.8h, v27.8b // P4[0..7] ++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] ++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] ++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] ++ uxtl v4.8h, v18.8b // P4[8..15] ++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl v1.8h, v1.8b // P8[0..7] ++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl v2.8h, v2.8b // P8[8..15] ++ uxtl v16.8h, v19.8b // P5[0..7] ++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl v18.8h, v23.8b // P5[8..15] ++ dup v19.8h, w2 // pq ++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] ++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] ++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ abs v23.8h, v21.8h ++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v26.8h, v22.8h ++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] ++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ sshr v23.8h, 
v23.8h, #1 // clip[0..7] ++ sshr v26.8h, v26.8h, #1 // clip[8..15] ++ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] ++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 ++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 ++ srshr v5.8h, v5.8h, #3 ++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ srshr v2.8h, v6.8h, #3 ++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ srshr v6.8h, v24.8h, #3 ++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ abs v5.8h, v5.8h // a1[0..7] ++ srshr v24.8h, v25.8h, #3 ++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ abs v2.8h, v2.8h // a2[0..7] ++ abs v6.8h, v6.8h // a1[8..15] ++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v17.8h, v24.8h // a2[8..15] ++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] ++ srshr v3.8h, v3.8h, #3 ++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8..15] ++ srshr v7.8h, v7.8h, #3 ++ bsl v20.16b, v2.16b, v5.16b // a3[0..7] ++ abs v2.8h, v3.8h // a0[8..15] ++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] ++ bsl v24.16b, v17.16b, v6.16b // a3[8..15] ++ abs v5.8h, v7.8h // a0[0..7] ++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] ++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq ++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] ++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] ++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? 
a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq ++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] ++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] ++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w7, v2.s[1] ++ mov w8, v2.s[3] ++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ mov w2, v5.s[1] // move to gp reg ++ cmhs v2.8h, v3.8h, v26.8h ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v23.8h ++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) ++ and w9, w7, w8 ++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) ++ and w10, w2, w3 ++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ and w9, w10, w9 ++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case ++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ sqxtun v2.8b, v4.8h ++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v27.8h ++ sqxtun v1.8b, v16.8h ++ sqxtun v3.8b, v18.8h ++ tbnz w2, #0, 1f ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f ++ st2 {v0.b, v1.b}[4], [x5], x1 ++ st2 {v0.b, v1.b}[5], [x5], x1 ++ st2 {v0.b, v1.b}[6], [x5], x1 ++ st2 {v0.b, v1.b}[7], [x5] ++2: tbnz w7, #0, 3f ++ st2 {v2.b, v3.b}[0], [x4], x1 ++ st2 {v2.b, v3.b}[1], [x4], x1 ++ st2 {v2.b, v3.b}[2], [x4], x1 ++ st2 {v2.b, v3.b}[3], [x4] ++3: tbnz w8, #0, 4f ++ st2 {v2.b, v3.b}[4], [x6], x1 ++ st2 {v2.b, v3.b}[5], [x6], x1 ++ st2 {v2.b, v3.b}[6], [x6], x1 ++ st2 
{v2.b, v3.b}[7], [x6] ++4: ret ++endfunc ++ ++// Copy at most the specified number of bytes from source to destination buffer, ++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence ++// On entry: ++// x0 -> source buffer ++// w1 = max number of bytes to copy ++// x2 -> destination buffer, optimally 8-byte aligned ++// On exit: ++// w0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ // Offset by 80 to screen out cases that are too short for us to handle, ++ // and also make it easy to test for loop termination, or to determine ++ // whether we need an odd number of half-iterations of the loop. ++ subs w1, w1, #80 ++ b.mi 90f ++ ++ // Set up useful constants ++ movi v20.4s, #3, lsl #24 ++ movi v21.4s, #3, lsl #16 ++ ++ tst w1, #32 ++ b.ne 1f ++ ++ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 ++ ext v25.16b, v0.16b, v1.16b, #1 ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ add w1, w1, #32 ++ b 3f ++ ++1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48 ++ ext v25.16b, v3.16b, v4.16b, #1 ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, 
#3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ // Drop through... ++2: mov v0.16b, v5.16b ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v0.16b, v1.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 90f ++ st1 {v3.16b, v4.16b}, [x2], #32 ++3: 
mov v3.16b, v2.16b ++ ld1 {v4.16b, v5.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v3.16b, v4.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 91f ++ st1 {v0.16b, v1.16b}, [x2], #32 ++ subs w1, w1, #64 ++ b.pl 2b ++ ++90: add w0, w1, #80 ++ ret ++ ++91: sub w1, w1, #32 ++ b 90b ++endfunc +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +index 2e9a3581de..d9571b437f 100644 +--- a/libavcodec/allcodecs.c ++++ b/libavcodec/allcodecs.c +@@ -153,6 +153,7 @@ extern AVCodec ff_hap_decoder; + extern AVCodec ff_hevc_decoder; + extern AVCodec ff_hevc_qsv_decoder; + extern AVCodec ff_hevc_rkmpp_decoder; ++extern AVCodec ff_hevc_rpi_decoder; + extern AVCodec ff_hevc_v4l2m2m_decoder; + extern AVCodec ff_hnm4_video_decoder; + extern AVCodec ff_hq_hqa_decoder; +@@ -917,6 +918,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID 
id) + } + } + ++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) ++{ ++ const enum AVPixelFormat *pf = p->pix_fmts; ++ ++ // Assume good if we lack info ++ if (pf == NULL) ++ return 1; ++ if (fmt == AV_PIX_FMT_NONE) ++ return 0; ++ ++ for (; *pf != AV_PIX_FMT_NONE; ++pf) { ++ if (*pf == fmt) ++ return 1; ++ } ++ return 0; ++} ++ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) ++{ ++ const AVCodec *p, *experimental = NULL; ++ void *i = 0; ++ ++ id= remap_deprecated_codec_id(id); ++ while ((p = av_codec_iterate(&i))) { ++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { ++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { ++ experimental = p; ++ } else ++ return (AVCodec *)p; ++ } ++ p = p->next; ++ } ++ return (AVCodec *)experimental; ++} ++ + static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) + { + const AVCodec *p, *experimental = NULL; +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index c4ab93aeeb..cd926f7b33 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -39,6 +39,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ + arm/sbrdsp_init_arm.o + OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o + OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ ++ arm/rpi_hevcpred_init_arm.o + OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o + OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o + OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o +@@ -137,10 +139,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o + NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_idct_neon.o \ + 
arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o ++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ ++ arm/rpi_hevc_misc_neon.o \ ++ arm/rpi_hevcdsp_deblock_neon.o \ ++ arm/rpi_hevcdsp_idct_neon.o \ ++ arm/rpi_hevcdsp_res8_neon.o \ ++ arm/rpi_hevcdsp_res16_neon.o \ ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcpred_init_neon.o \ ++ arm/rpi_hevcpred_intra_angular_neon.o \ ++ arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_filter_neon.o \ ++ arm/rpi_hevcpred_intra_hv_neon.o \ ++ arm/rpi_hevcpred_intra_planar_neon.o + NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o + NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ + arm/rv40dsp_neon.o +diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +index fdbf86b45e..4755f20e2e 100644 +--- a/libavcodec/arm/cabac.h ++++ b/libavcodec/arm/cabac.h +@@ -26,83 +26,209 @@ + #include "libavutil/internal.h" + #include "libavcodec/cabac.h" + ++ + #define get_cabac_inline get_cabac_inline_arm + static av_always_inline int get_cabac_inline_arm(CABACContext *c, +- uint8_t *const state) ++ uint8_t *state) + { +- int bit; +- void *reg_b, *reg_c, *tmp; ++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; ++ int bit, ptr, low, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[bit], [%[c], %[range_off]] \n\t" ++ "ldrb %[ptr], [%[state]] \n\t" ++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp2], %[bit], #0xc0 \n\t" ++ "add %[tmp1], %[tmp1], %[ptr] \n\t" ++ "ldr %[low], [%[c], %[low_off]] \n\t" ++ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" ++ "sub %[bit], %[bit], %[tmp2] \n\t" ++ "mov %[tmp1], %[bit] \n\t" ++ "cmp %[low], %[bit], lsl #17 \n\t" ++ "itt ge \n\t" ++ "movge %[tmp1], %[tmp2] \n\t" ++ "mvnge %[ptr], %[ptr] \n\t" ++ "clz %[tmp2], %[tmp1] \n\t" ++ "it ge \n\t" ++ "subge %[low], %[low], %[bit], lsl #17 \n\t" ++ "sub %[tmp2], %[tmp2], #23 \n\t" ++ "and %[bit], %[ptr], #1 \n\t" ++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" ++ 
"lsl %[low], %[low], %[tmp2] \n\t" ++ "lsls %[ptr], %[low], #16 \n\t" ++ "bne 1f \n\t" ++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "ldrh %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldr %[tmp1], [%[c], %[end_off]] \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "cmp %[tmp1], %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "clz %[state], %[state] \n\t" ++ "movw %[mlps_tables], #0xffff \n\t" ++ "sub %[state], %[state], #16 \n\t" ++ "str %[tmp2], [%[c], %[range_off]] \n\t" ++ "rev %[tmp1], %[tmp1] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsr %[tmp1], %[tmp1], #15 \n\t" ++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp1], %[tmp1], %[state] \n\t" ++ "add %[low], %[low], %[tmp1] \n\t" ++#else ++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" ++#endif ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "b 2f \n\t" ++ "1: \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "str %[tmp1], [%[c], %[range_off]] \n\t" ++ "2: \n\t" ++ : // Outputs ++ [state]"+r"(state), ++ [mlps_tables]"+r"(mlps_tables), ++ [bit]"=&r"(bit), ++ [ptr]"=&r"(ptr), ++ [low]"=&r"(low), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return bit; ++} + +- __asm__ volatile( +- "ldrb %[bit] , [%[state]] \n\t" +- "add %[r_b] , %[tables] , %[lps_off] \n\t" +- "mov %[tmp] , 
%[range] \n\t" +- "and %[range] , %[range] , #0xC0 \n\t" +- "add %[r_b] , %[r_b] , %[bit] \n\t" +- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "sub %[r_c] , %[tmp] , %[range] \n\t" +- "lsl %[tmp] , %[r_c] , #17 \n\t" +- "cmp %[tmp] , %[low] \n\t" +- "it gt \n\t" +- "movgt %[range] , %[r_c] \n\t" +- "itt cc \n\t" +- "mvncc %[bit] , %[bit] \n\t" +- "subcc %[low] , %[low] , %[tmp] \n\t" +- "add %[r_c] , %[tables] , %[mlps_off] \n\t" +- "ldrb %[tmp] , [%[r_b], %[range]] \n\t" +- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" +- "lsl %[low] , %[low] , %[tmp] \n\t" +- "lsl %[range] , %[range] , %[tmp] \n\t" +- "uxth %[r_c] , %[low] \n\t" +- "strb %[r_b] , [%[state]] \n\t" +- "tst %[r_c] , %[r_c] \n\t" +- "bne 2f \n\t" +- "ldr %[r_c] , [%[c], %[byte]] \n\t" ++#define get_cabac_bypass get_cabac_bypass_arm ++static inline int get_cabac_bypass_arm(CABACContext * const c) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ int rv; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "mov %[rv] , #0 \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "itt cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "movcs %[rv] , #1 \n\t" + #if UNCHECKED_BITSTREAM_READER +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "add %[r_c] , %[r_c] , #2 \n\t" +- "str %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" + #else +- "ldr %[r_b] , [%[c], %[end]] \n\t" +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "cmp %[r_c] , %[r_b] \n\t" +- "itt lt \n\t" +- "addlt %[r_c] , %[r_c] , #2 \n\t" +- "strlt %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif + #endif +- "sub %[r_c] , %[low] , #1 \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "eor 
%[r_c] , %[low] , %[r_c] \n\t" +- "rev %[tmp] , %[tmp] \n\t" +- "lsr %[r_c] , %[r_c] , #15 \n\t" +- "lsr %[tmp] , %[tmp] , #15 \n\t" +- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" +- "movw %[r_b] , #0xFFFF \n\t" +- "sub %[tmp] , %[tmp] , %[r_b] \n\t" +- "rsb %[r_c] , %[r_c] , #7 \n\t" +- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" +- "add %[low] , %[low] , %[tmp] \n\t" +- "2: \n\t" +- : [bit]"=&r"(bit), +- [low]"+&r"(c->low), +- [range]"+&r"(c->range), +- [r_b]"=&r"(reg_b), +- [r_c]"=&r"(reg_c), +- [tmp]"=&r"(tmp) +- : [c]"r"(c), +- [state]"r"(state), +- [tables]"r"(ff_h264_cabac_tables), +- [byte]"M"(offsetof(CABACContext, bytestream)), +- [end]"M"(offsetof(CABACContext, bytestream_end)), +- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), +- [lps_off]"I"(H264_LPS_RANGE_OFFSET), +- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) +- : "memory", "cc" +- ); ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" + +- return bit & 1; ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"=&r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)) ++ : // Clobbers ++ "memory", "cc" ++ ); ++ return rv; + } ++ ++ ++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm ++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl 
#17 \n\t" ++ "it cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "it cc \n\t" ++ "rsbcc %[rv] , %[rv], #0 \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" ++ ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)) ++ : // Clobbers ++ "memory", "cc" ++ ); ++ return rv; ++} ++ + #endif /* HAVE_ARMV6T2_INLINE */ + + #endif /* AVCODEC_ARM_CABAC_H */ +diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h +new file mode 100644 +index 0000000000..c88dec6eff +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_cabac.h +@@ -0,0 +1,607 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVC_CABAC_H ++#define AVCODEC_ARM_HEVC_CABAC_H ++ ++#include "config.h" ++#if HAVE_ARMV6T2_INLINE ++ ++#define hevc_mem_bits32 hevc_mem_bits32_arm ++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) ++{ ++ unsigned int n; ++ __asm__ ( ++ "rev %[n], %[x] \n\t" ++ : [n]"=r"(n) ++ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) ++ : ++ ); ++ return n << (bits & 7); ++} ++ ++ ++// --------------------------------------------------------------------------- ++// ++// Helper fns - little bits of code where ARM has an instraction that the ++// compiler doesn't know about / use ++ ++#define trans_scale_sat trans_scale_sat_arm ++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) ++{ ++ int rv; ++ int t = ((level * (int)(scale * scale_m)) >> shift) + 1; ++ ++ __asm__ ( ++ "ssat %[rv], #16, %[t], ASR #1 \n\t" ++ : [rv]"=r"(rv) ++ : [t]"r"(t) ++ : ++ ); ++ return rv; ++} ++ ++#define update_rice update_rice_arm ++static inline void update_rice_arm(uint8_t * const stat_coeff, ++ const unsigned int last_coeff_abs_level_remaining, ++ const unsigned int c_rice_param) ++{ ++ int t = last_coeff_abs_level_remaining << 1; ++ __asm__ ( ++ "lsrs %[t], %[t], %[shift] \n\t" ++ ++ "it eq \n\t" ++ "subeq %[stat], %[stat], #1 \n\t" ++ "cmp %[t], #6 \n\t" ++ "adc %[stat], %[stat], #0 \n\t" ++ "usat %[stat], #8, %[stat] \n\t" ++ : [stat]"+r"(*stat_coeff), ++ [t]"+r"(t) ++ : [shift]"r"(c_rice_param) ++ : "cc" ++ ); ++} ++ ++// --------------------------------------------------------------------------- ++// ++// CABAC get loops ++// ++// Where the loop is simple enough we can normally do 10-30% better than the 
++// compiler ++ ++// Get the residual greater than 1 bits ++ ++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm ++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, ++ uint8_t * const state0) ++{ ++ unsigned int i, reg_b, st, tmp, bit, rv; ++ __asm__ ( ++ "mov %[i] , #0 \n\t" ++ "mov %[rv] , #0 \n\t" ++ "1: \n\t" ++ "add %[i] , %[i] , #1 \n\t" ++ "cmp %[rv] , #0 \n\t" ++ "ite eq \n\t" ++ "usateq %[st] , #2 , %[i] \n\t" ++ "movne %[st] , #0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "add %[r_b] , %[r_b] , %[bit] \n\t" ++ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ ++ "clz %[tmp] , %[range] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "and %[bit] , %[bit] , #1 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out what that meant later ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "it ne \n\t" ++ "cmpne %[n] , %[i] \n\t" ++ "bne 1b \n\t" ++ ++// If reload is not required then we must have run out of flags to decode ++ "tst %[tmp] , %[tmp] \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "cmp %[n] , %[i] \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add 
%[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++ "bne 1b \n\t" ++ "2: \n\t" ++ : [bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [i]"=&r"(i), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [rv]"=&r"(rv) ++ : [state0]"r"(state0), ++ [n]"r"(n), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ return rv; ++} ++ ++ ++// n must be > 0 on entry ++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm ++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t * ctx_map, ++ uint8_t * p) ++{ ++ unsigned int reg_b, tmp, st, bit; ++ __asm__ ( ++// Get bin from map ++#if CONFIG_THUMB ++ "add %[ctx_map] , %[n] \n\t" ++ "ldrb %[st] , [%[ctx_map]] \n\t" ++#else ++ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" ++#endif ++ "1: \n\t" ++ ++// Load state & ranges ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" ++ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ ++// Renorm ++ "clz %[tmp] , %[range] \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "tst %[bit] , #1 \n\t" ++ "ldrb %[st] , [%[ctx_map], #-1]! 
\n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++// GCC asm seems to need strbne written differently for thumb and arm ++#if CONFIG_THUMB ++ "it ne \n\t" ++ "strbne %[n] , [%[idx]] , #1 \n\t" ++#else ++ "strneb %[n] , [%[idx]] , #1 \n\t" ++#endif ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out what that meant later ++ "subs %[n] , %[n] , #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++#if CONFIG_THUMB ++ "itt ne \n\t" ++ "lslsne %[tmp] , %[low] , #16 \n\t" ++#else ++ "lslnes %[tmp] , %[low] , #16 \n\t" ++#endif ++ "bne 1b \n\t" ++ ++// If we have bits left then n must be 0 so give up now ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "cmp %[n] , #0 \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add %[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++// Check to see if we still have more to do ++ "bne 1b \n\t" ++ "2: \n\t" ++ : [bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [idx]"+r"(p), ++ [n]"+r"(n), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [ctx_map]"+r"(ctx_map) ++ : [state0]"r"(state0), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ ++ return p; ++} ++ ++// --------------------------------------------------------------------------- ++// ++// CABAC_BY22 functions ++ ++ ++#define get_cabac_by22_start get_cabac_by22_start_arm ++static inline void get_cabac_by22_start_arm(CABACContext * const c) ++{ ++ const uint8_t *ptr = c->bytestream; ++ register 
uint32_t low __asm__("r1"), range __asm__("r2"); ++ uint32_t m, range8, bits; ++#if !USE_BY22_DIV ++ uintptr_t inv; ++#endif ++ ++ av_assert2(offsetof (CABACContext, low) == 0); ++ av_assert2(offsetof (CABACContext, range) == 4); ++ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); ++ __asm__ volatile ( ++ "ldmia %[c], {%[low], %[range]} \n\t" ++ : // Outputs ++ [low]"=r"(low), ++ [range]"=r"(range) ++ : // Inputs ++ [c]"r"(c) ++ : // Clobbers ++ ); ++#if !USE_BY22_DIV ++ inv = (uintptr_t)cabac_by22_inv_range; ++#endif ++ __asm__ volatile ( ++ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" ++#if !USE_BY22_DIV ++ "uxtb %[range8], %[range] \n\t" ++#endif ++ "rbit %[bits], %[low] \n\t" ++ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "clz %[bits], %[bits] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "rev %[m], %[m] \n\t" ++ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "eor %[m], %[m], #0x80000000 \n\t" ++#if !USE_BY22_DIV ++ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" ++ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" ++ "str %[range], [%[c], %[bits_off]] \n\t" ++#else ++ "strh %[bits], [%[c], %[bits_off]] \n\t" ++#endif ++#if CONFIG_THUMB ++ "lsr %[m], %[ptr] \n\t" ++ "eor %[range], %[low], %[m] \n\t" ++#else ++ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" ++#endif ++ : // Outputs ++ [ptr]"+&r"(ptr), ++ [low]"+&r"(low), ++ [range]"+&r"(range), ++#if !USE_BY22_DIV ++ [inv]"+&r"(inv), ++#endif ++ [m]"=&r"(m), ++ [range8]"=&r"(range8), ++ [bits]"=&r"(bits) ++ : // Inputs ++ [c]"r"(c), ++ [bits_off]"J"(offsetof (CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof (CABACContext, bytestream)) ++ : // Clobbers ++ "memory" ++ ); ++ c->low = range; ++#if !USE_BY22_DIV ++ c->range = inv; ++#endif ++} ++ ++#define get_cabac_by22_peek get_cabac_by22_peek_arm ++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) ++{ ++ uint32_t rv = c->low &~ 1, tmp; ++ 
__asm__ ( ++ "cmp %[inv] , #0 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [tmp]"=r"(tmp) ++ : // Inputs ++ [inv]"r"(c->range) ++ : // Clobbers ++ "cc" ++ ); ++ return rv << 1; ++} ++ ++#define get_cabac_by22_flush get_cabac_by22_flush_arm ++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) ++{ ++ uint32_t bits, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "rsb %[tmp1], %[n], #32 \n\t" ++ "add %[bits], %[bits], %[n] \n\t" ++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" ++ "lsr %[tmp1], %[val], %[tmp1] \n\t" ++ "ldr %[val], [%[cc], %[low_off]] \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] \n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" ++#endif ++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" ++ "and %[tmp2], %[bits], #7 \n\t" ++ "strh %[bits], [%[cc], %[bits_off]] \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[tmp1], %[tmp1], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[val], %[n] \n\t" ++ "sub %[val], %[tmp1] \n\t" ++#else ++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], %[tmp2] \n\t" ++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" ++ "str %[val], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [val]"+r"(val), ++ [bits]"=&r"(bits), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [n]"r"(n), ++ [bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [range_off]"J"(offsetof(CABACContext, by22.range)), ++ [low_off]"J"(offsetof(CABACContext, low)) ++ : // Clobbers ++ "memory" ++ ); ++} ++ ++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm ++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) ++{ ++ uint32_t 
last_coeff_abs_level_remaining; ++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[remain], [%[cc], %[low_off]] \n\t" ++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" ++ "bic %[remain], %[remain], #1 \n\t" ++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[prefix], #0 \n\t" ++ "it ne \n\t" ++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" ++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" ++ "lsl %[remain], %[remain], #1 \n\t" ++ "mvn %[prefix], %[remain] \n\t" ++ "clz %[prefix], %[prefix] \n\t" ++ "rsbs %[n1], %[prefix], #2 \n\t" ++ "bcc 1f \n\t" ++ "adc %[n1], %[rice], %[prefix] \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp2], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "ldr %[range], [%[cc], %[low_off]] \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" ++ "rsb %[tmp2], %[rice], #31 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[range], %[n1] \n\t" ++ "sub %[range], %[n2] \n\t" ++#else ++ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" ++#endif ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[tmp2] \n\t" ++ "add %[remain], %[n2] \n\t" ++#else ++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" ++#endif ++ "b 3f \n\t" ++ "1: \n\t" ++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" ++ "cmp %[n2], %[peek_bits_plus_2] \n\t" ++ "bhi 2f \n\t" ++ "sub %[n1], %[n2], #2 \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp1], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "rsb %[range], %[rice], #34 \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" ++ 
"and %[tmp1], %[tmp2], #7 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" ++ "rsb %[prefix], %[prefix], %[range] \n\t" ++ "orr %[remain], %[remain], #0x80000000 \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++ "mov %[range], #2 \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp2], %[n1] \n\t" ++ "sub %[tmp2], %[n2] \n\t" ++#else ++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "lsl %[rice], %[range], %[rice] \n\t" ++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[prefix] \n\t" ++ "add %[remain], %[rice] \n\t" ++#else ++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" ++#endif ++ "b 4f \n\t" ++ "2: \n\t" ++ "add %[n1], %[tmp2], %[prefix] \n\t" ++#if CONFIG_THUMB ++ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" ++ "ldr %[tmp2], [%[tmp2]] \n\t" ++#else ++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" ++#endif ++ "rsb %[tmp1], %[prefix], #32 \n\t" ++ "push {%[rice]} \n\t" ++ "and %[rice], %[n1], #7 \n\t" ++ "lsr %[tmp1], %[remain], %[tmp1] \n\t" ++ "ldr %[ptr], [%[cc], %[low_off]] \n\t" ++ "mul %[remain], %[range], %[tmp1] \n\t" ++ "rev %[tmp2], %[tmp2] \n\t" ++ "rsb %[n2], %[prefix], %[n2] \n\t" ++ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" ++ "lsl %[rice], %[tmp2], %[rice] \n\t" ++ "sub %[tmp2], %[n2], #2 \n\t" ++ "lsl %[remain], %[remain], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[ptr], %[prefix] \n\t" ++ "rsb %[remain], %[ptr] \n\t" ++#else ++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" ++#endif ++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" ++ "add %[prefix], %[n1], %[tmp2] \n\t" ++ "bic %[n1], %[remain], #1 \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[tmp1], #0 \n\t" ++ "rsb %[rice], %[tmp2], #32 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" ++ "and %[tmp1], %[prefix], #7 \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] 
\n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" ++#endif ++ "lsl %[n1], %[n1], #1 \n\t" ++ "lsr %[rice], %[n1], %[rice] \n\t" ++ "rsb %[n2], %[n2], #34 \n\t" ++ "mul %[range], %[range], %[rice] \n\t" ++ "pop {%[rice]} \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "orr %[n1], %[n1], #0x80000000 \n\t" ++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" ++ "mov %[prefix], #2 \n\t" ++ "lsl %[range], %[range], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[remain], %[tmp2] \n\t" ++ "rsb %[range], %[remain] \n\t" ++#else ++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" ++#endif ++ "lsl %[remain], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[n1], %[n2] \n\t" ++ "add %[remain], %[n1] \n\t" ++#else ++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" ++#endif ++ "3: \n\t" ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" ++ "4: \n\t" ++ "str %[range], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [remain]"=&r"(last_coeff_abs_level_remaining), ++ [rice]"+r"(rice_param), ++ [prefix]"=&r"(prefix), ++ [n1]"=&r"(n1), ++ [range]"=&r"(range), ++ [n2]"=&r"(n2), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return last_coeff_abs_level_remaining; ++} ++ ++#endif /* HAVE_ARMV6T2_INLINE */ ++ ++#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..978b7b6947 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +@@ -0,0 +1,183 @@ ++/* ++ * ARM NEON optimised IDCT functions for HEVC decoding ++ * Copyright (c) 
2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vst1.16 {q8, q9}, [r0], r3 ++ vst1.16 {q8, q9}, [r2], r3 ++ vst1.16 {q8, q9}, [r0] ++ vst1.16 {q8, q9}, [r2] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #16*16 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++function 
JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #32*32 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vldr.i32 s0, =0x00240053 // 36 and 83 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ ++ tr4_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++ ++ .ltorg ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vmov.i32 d0, #0x4a // 74 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ vmov.i32 d1, #0x1d // 29 ++ vmov.i32 d2, #0x37 // 55 ++ ++ tr4_luma_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_luma_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 ++ add r2, r0, #16 ++ adr r3, tr4f ++ vpush {d8-d15} ++ vld1.16 {d0, d1}, [r3] ++ mov r3, #32 ++ ++ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ ++ "sub r0, r0, #128-8", \ ++ "sub r2, r2, #128-8", \ ++ "cmp r1, #4" ++ ble 2f ++ ++ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ ++ "sub r0, r0, #128+8", \ ++ "sub r2, r2, #128+8+16-32", \ ++ "mov r3, #64" ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ vzip.16 d20, d21 ++ vzip.16 d22, d23 ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q10, q11 ++ vzip.32 q14, q15 ++1: ++ vzip.16 d24, d25 ++ vzip.16 d26, d27 ++ vzip.32 q8, q9 ++ vzip.32 q12, q13 ++ ++ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT ++ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT ++ ++ vpop {d8-d15} ++ bx lr ++ ++2: vmov.i64 q10, #0 ++ sub r0, r0, #8 ++ vmov.i64 q11, #0 ++ sub r2, r2, 
#8+16-32 ++ vmov.i64 q14, #0 ++ mov r3, #64 ++ vmov.i64 q15, #0 ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ b 1b ++ ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S +new file mode 100644 +index 0000000000..161bb0d7c9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.S +@@ -0,0 +1,267 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Written by John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ mov ip, #1 ++ vmov.i64 q0, #0 ++ teq r1, #0 ++ vmov.i64 q1, #0 ++ beq 2f ++ ++ lsl ip, r1 @ 2, 4 or 8 ++ add r2, r0, #32 ++ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero ++ mov r3, #64 ++1: vst1.8 {q0,q1}, [r0:256], r3 ++ subs ip, #2 ++ vst1.8 {q0,q1}, [r2:256], r3 ++ bne 1b ++ bx lr ++ ++2: vst1.8 {q0,q1}, [r0:256] ++ bx lr ++endfunc ++ ++@ PIC jump tables are more expensive than absolute for A32 code ++.set jent_pic, CONFIG_PIC || CONFIG_THUMB ++ ++@ Jump table entry - if in neon mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++T .short ((0 + \lab) - (0 + 98b)) / 2 ++A .short (0 + \lab) - (4 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.set expected_next, 0 ++ ++.macro cpy_compound val, p1, p2, drop_thru=0 ++.if \p1 + \p2 != \val ++.error "Bad addition! 
\p1 + \p2 != \val" ++.endif ++.if expected_next != 0 && expected_next != \val ++.error "Drop thru failure" ++.endif ++\val\(): ++ push {r0-r3} ++ bl 100\p1\()b ++ pop {r0-r3} ++ add r0, #\p1 ++ add r2, #\p1 ++.if \drop_thru == 0 ++ b \p2\()b ++.set expected_next, 0 ++.else ++.set expected_next, \p2 ++.endif ++.endm ++ ++@ ff_hevc_cpy_blks8x4_neon( ++@ dst [r0] ++@ dst_stride [r1] ++@ src [r2] ++@ src_stride [r3] ++@ width [sp, #0] (bytes) ++@ height) [sp, #4] ++@ ++@ Power of 2 widths are directly coded, all others are done in stripes ++@ We expect the vast majority of calls to be power of 2 ++@ ++@ Currently has min width of 8, but we could make that 4 without issue ++@ Min height is 4 ++ ++function ff_hevc_rpi_cpy_blks8x4_neon, export=1 ++ ldr r12, [sp, #0] ++ push {r11, lr} ++.if jent_pic ++A adr lr, 98f - 2 ++.else ++A adr lr, 98f - 4 ++.endif ++ lsr r12, #3 ++ ldr r11, [sp, #(8 + 4)] ++.if jent_pic ++A lsl r12, #1 ++A ldrsh lr, [lr, r12] ++A add pc, lr ++T tbh [pc, r12, lsl #1] ++.else ++ @ A32 only, Thumb is always PIC ++ ldr pc, [lr, r12, lsl #2] ++.endif ++ ++98: ++T .short 0 @ unused ++ jent 8f ++ jent 16f ++ jent 24f ++ jent 32f ++ jent 40f ++ jent 48f ++ jent 56f ++ jent 64f ++ jent 72f ++ jent 80f ++ jent 88f ++ jent 96f ++ jent 104f ++ jent 112f ++ jent 120f ++ jent 128f ++ ++1008: ++ push {r11, lr} ++8: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {d0 }, [r2], r3 ++ vld1.32 {d1 }, [lr], r3 ++ vld1.32 {d2 }, [r2], r3 ++ vld1.32 {d3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {d0 }, [r0], r1 ++ vst1.32 {d1 }, [r12], r1 ++ vst1.32 {d2 }, [r0], r1 ++ vst1.32 {d3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10016: ++ push {r11, lr} ++16: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q0 }, [r2], r3 ++ vld1.32 {q1 }, [lr], r3 ++ vld1.32 {q2 }, [r2], r3 ++ vld1.32 {q3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q0 }, [r0], r1 ++ vst1.32 {q1 }, [r12], r1 ++ vst1.32 {q2 }, [r0], r1 ++ vst1.32 {q3 }, 
[r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10032: ++ push {r11, lr} ++32: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10064: ++ push {r11, lr} ++64: ++ add lr, r2, #32 ++ add r12, r0, #32 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #2 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++128: ++ push {r4, r5} ++ @ We could do this with fewer registers if we jump around but I ++ @ have a primative urge to load sequentially ++ mov r4, #64 ++ add lr, r2, #32 ++ add r12, r0, #32 ++ sub r3, r4 ++ sub r1, r4 ++1: ++ vld1.32 {q8, q9 }, [r2], r4 ++ vld1.32 {q10, q11}, [lr], r4 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #1 ++ vst1.32 {q8, q9 }, [r0], r4 ++ vst1.32 {q10, q11}, [r12], r4 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r4, r5, r11, pc} ++ ++@ Use drop_thru where we can ++cpy_compound 104, 64, 40, 1 ++cpy_compound 40, 32, 8 ++ ++cpy_compound 112, 64, 48, 1 ++cpy_compound 48, 32, 16 ++ ++cpy_compound 120, 64, 56, 1 ++cpy_compound 56, 32, 24, 1 ++cpy_compound 24, 16, 8 ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 ++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++ ++ ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h +new file mode 100644 +index 0000000000..9d21f6a882 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.h +@@ -0,0 +1,438 @@ ++/* ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H ++#define AVCODEC_ARM_RPI_HEVC_MISC_H ++ ++#include "config.h" ++#if HAVE_NEON_INLINE && !CONFIG_THUMB ++// Gather: read one (1 << pixel_shift)-byte element per row of src (rows stride_src apart) and store them contiguously at dst. The loops count height down by 4 (by 8 in the default/byte case) and exit only on exactly 0. ++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_src) ++{ ++ const uint8_t *src2 = src + stride_src; ++ stride_src <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q1}, [%[dst]]!
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {q0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {q1}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "q0", "q1" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2}, [%[dst]]!
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "vst1.16 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "vst1.16 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "q0", "q1" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2}, [%[dst]]!
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "vst1.8 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "vst1.8 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "q0", "q1" ++ ); ++ break; ++ } ++} ++// Scatter: read contiguous elements from src and store one per row of dst (rows stride_dst apart). Same countdown as v2h: 4 rows per step, 8 in the default/byte case, exit only on exactly 0. ++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst) ++{ ++ uint8_t *dst2 = dst + stride_dst; ++ stride_dst <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.32 {q0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {q1}, [%[src]]! \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {q0}, [%[src]]!
\n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]] \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]] \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "q0", "q1" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2}, [%[src]]! \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0}, [%[src]]!
\n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]] \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]] \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "d0", "d2" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2}, [%[src]]! \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0}, [%[src]]!
\n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]] \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]] \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (incl. the NEON registers the template overwrites) ++ "cc", "memory", "d0", "d2" ++ ); ++ break; ++ } ++} ++// Strided-to-strided copy through core registers, two rows per loop pass; the countdown reaches 0 only for even height >= 4. ++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int x, y; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "ldr %[y], [%[src]], %[stride_src] \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "str %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldr %[y], [%[src]],
%[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "str %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "strh %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strh %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "strb %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strb %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++// Dispatch on which side is contiguous (stride == 1 << pixel_shift): dst -> v2h, src -> h2v, neither -> v2v. ++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon ++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst,
ptrdiff_t stride_src) ++{ ++ if (stride_dst == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); ++ else if (stride_src == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); ++ else ++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); ++} ++ ++#endif /* HAVE_NEON_INLINE && !CONFIG_THUMB */ ++ ++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ +diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h +new file mode 100644 +index 0000000000..325c26a49b +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_mv_arm.h +@@ -0,0 +1,93 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Written by John Cox, Ben Avison ++*/ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H ++#define AVCODEC_ARM_RPI_HEVC_MV_H ++ ++#if HAVE_ARMV6T2_INLINE ++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) ++{ ++ MvXY r; ++ __asm__ ( ++ "sadd16 %[r], %[a], %[b] \n\t" ++ : [r]"=r"(r) ++ : [a]"r"(a), ++ [b]"r"(b) ++ : ++ ); ++ return r; ++} ++#define mvxy_add mvxy_add_arm ++#endif ++ ++#if HAVE_ARMV6T2_INLINE ++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) ++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) ++{ ++ int t; ++ __asm__ ( ++ "ssat %[td], #8, %[td] \n\t" ++ "ssat %[tb], #8, %[tb] \n\t" ++ "eor %[t], %[td], %[td], asr #31 \n\t" ++ "adds %[t], %[t], %[td], lsr #31 \n\t" ++ "asr %[t], #1 \n\t" ++ "add %[t], #0x4000 \n\t" ++ "it ne \n\t" ++ "sdivne %[t], %[t], %[td] \n\t" ++ "mov %[td], #32 \n\t" ++ "smlabb %[td], %[t], %[tb], %[td] \n\t" ++ "ssat %[td], #13, %[td], asr #6 \n\t" ++ "mov %[tb], #127 \n\t" ++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" ++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" ++// This takes the sign of x & y for rounding at the "wrong" point ++// (i.e. 
after adding 127) but for the range of values (-1,-127) ++// where it does the wrong thing you get the right answer (0) anyway ++ "add %[t], %[t], %[t], lsr #31 \n\t" ++ "add %[xy], %[tb], %[tb], lsr #31 \n\t" ++ "ssat %[t], #16, %[t], asr #8 \n\t" ++ "ssat %[xy], #16, %[xy], asr #8 \n\t" ++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" ++ : ++ [t]"=&r"(t), ++ [xy]"+r"(xy), ++ [td]"+r"(td), ++ [tb]"+r"(tb) ++ : ++ : ++ "cc" ++ ); ++ return xy; ++} ++#define mv_scale_xy mv_scale_xy_arm ++#endif ++#endif ++ ++#endif // AVCODEC_ARM_RPI_HEVC_MV_H ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h +new file mode 100644 +index 0000000000..62b9326532 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_arm.h +@@ -0,0 +1,26 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVCDSP_ARM_H ++#define AVCODEC_ARM_RPI_HEVCDSP_ARM_H ++ ++#include "libavcodec/rpi_hevcdsp.h" ++// NOTE(review): only declared here; presumably installs the NEON routines into *c for the given bit depth - confirm against the definition. ++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_RPI_HEVCDSP_ARM_H */ +diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +new file mode 100644 +index 0000000000..88a3b4e5e7 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +@@ -0,0 +1,1634 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 ++ vsubl.u8 q0, \Q0a, \P0a ++ vsubl.u8 q1, \P1a, \Q1a ++ vdup.16 d4, r2 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 ++ vmovl.u8 q2, d4 ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 ++ vmin.s16 q0, q2 ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q0, q1 ++ vaddw.u8 q1, q0, \P0a ++ vsub.i16 q0, q2, q0 ++ vqmovun.s16 \P0a, q1 ++ vqmovun.s16 \Q0a, q0 ++.endm ++ ++ ++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 ++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b ++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a ++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vmovl.u8 q2, d4 @ tc0a, tc0b ++ \I3 ++ vmovl.u8 q3, d6 @ tc1a, tc1b ++ \I4 ++ vmin.s16 q0, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 ++ vmin.s16 q1, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q1, q3 @ delta0b ++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a ++ vsub.i16 q0, q2, q0 @ q0a - delta0a ++ vmovl.u8 q2, \Q0b ++ vsub.i16 q2, q1 @ q0b - delta0b ++ vaddw.u8 q1, \P0b @ p0b + delta0b ++ vqmovun.s16 \Q0a, q0 ++ vqmovun.s16 \P0a, q3 ++ vqmovun.s16 \Q0b, q2 ++ vqmovun.s16 \P0b, q1 ++.endm ++ ++ ++@ 
Preserves r12 ++@ Clobbers r2 ++@ P0a et al all contain UVUVUVUV ++@ r2 (tc4) contains ++@ [0..7] tc U a ++@ [8..15] tc V a ++ ++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 ++ vsub.i16 q0, \Q0a, \P0a ++ vsub.i16 q1, \P1a, \Q1a ++ vdup.16 d4, r2 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 ++ vmin.s16 q0, q2 ++ vmov.i16 q2, #0 ++ vmax.s16 q0, q1 ++ vadd.i16 \P0a, q0 ++ vsub.i16 \Q0a, q0 ++ vmov.i16 q1, #(1 << \bit_depth) - 1 ++ vmax.s16 \P0a, q2 ++ vmax.s16 \Q0a, q2 ++ vmin.s16 \P0a, q1 ++ vmin.s16 \Q0a, q1 ++.endm ++ ++@ Clobbers r2, r12 ++@ P0a et al all contain UVUVUVUV ++@ r2 (tc4) contains ++@ [0..7] tc U a ++@ [8..15] tc V a ++@ [16..23] tc U b ++@ [24..31] tc V b ++ ++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 ++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b ++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a ++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b ++ \I3 ++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b ++ \I4 ++ vmin.s16 q0, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 ++ vmin.s16 q1, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vadd.i16 \P0a, q0 @ p0a + delta0a ++ vsub.i16 \Q0a, q0 @ q0a - delta0a ++ vmax.s16 q1, q3 @ delta0b ++ vadd.i16 \P0b, q1 @ p0b + delta0b ++ vsub.i16 \Q0b, q1 @ q0b - delta0b ++ vmov.i16 q2, #0 ++ 
vmov.i16 q3, #(1 << \bit_depth) - 1 ++ vmax.s16 \P0a, q2 ++ vmax.s16 \Q0a, q2 ++ vmax.s16 \P0b, q2 ++ vmax.s16 \Q0b, q2 ++ vmin.s16 \P0a, q3 ++ vmin.s16 \Q0a, q3 ++ vmin.s16 \P0b, q3 ++ vmin.s16 \Q0b, q3 ++.endm ++ ++ ++ ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++.macro hevc_loop_filter_luma_start ++ ldr r12, [r3] ++ ldr r3, [r3, #4] ++ orrs r3, r12, r3, lsl #16 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldrd r4, r5, [sp, #32] @ &_no_p ++ ldrb r4, [r4] ++ ldrb r5, [r5] ++ movs r10, r4 ++ it ne ++ movne r10, #1 ++ cmp r5, #0 ++ it ne ++ orrne r10, #2 ++.endm ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ ++@ Input & output ++@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) ++@ 16-bit: q8-q15 ++@ ++@ r1 -r1 ++@ r10 b1->C, b0->N (r10 junk) ++@ ++@ Junks: ++@ r5, r6, r7, r8, r9 ++ ++.macro m_filter_luma bit_depth, Q11, Q15 ++.if \bit_depth == 8 ++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 ++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 ++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 ++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 ++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 ++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 ++.endif ++ vadd.i16 q0, q9, \Q11 @ P2 + P0 ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif ++ vadd.i16 q1, q14, q12 @ Q2 + Q0 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif ++ vsub.i16 q0, q10 @ P2 - P1 + P0 ++ lsr r5, r3, #16 ++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 ++.if \bit_depth == 8 ++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 ++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... 
Q3 ++.endif ++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) ++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) ++ vmov.i64 q2, #0xffffffff0000 ++ vbic q0, q2 @ only dp0(') and dp3(') ++ vbic q1, q2 @ only dq0(') and dq3(') ++ vsra.u64 q0, #16 ++ vsra.u64 q1, #16 ++ vdup.16 q3, r2 @ beta ++ vdup.16 d14, r3 @ tC[0] ++ vdup.16 d15, r5 @ tC[1] ++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) ++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 ++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 ++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 ++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) ++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) ++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 ++ vshl.s16 q6, q7, #2 @ tC[] * 4 ++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 ++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) ++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) ++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 ++ cmp r7, #0 ++ beq .Lbypasswrite ++ ++ vcgt.s16 q5, q6, q5 @ if < tc25 ++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) ++ vand q4, q5 ++ vbic d8, d4 ++ vbic d9, d4 ++ vshr.s16 q3, #2 @ beta_2 = beta >> 2 ++ vsra.u64 q4, #16 ++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 ++ vshl.i16 q7, #1 @ tc2 = tC[] << 1 ++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc ++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half ++ vand d6, d8 @ && beta_2 tests, prime in ms half ++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 ++ vneg.s16 q6, q7 @ -tc2 ++ vmovn.i32 d8, q3 ++ vshrn.i32 d6, q3, #16 ++ vand d6, d8 ++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 ++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) ++ vadd.i16 q0, \Q11, q12 @ p0 + q0 ++ ands r9, r7, r8 ++ beq 1f ++ ++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 ++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 ++ lsr r3, r9, #16 ++ 
vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) ++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) ++ vadd.i16 q0, q8, q9 @ p3 + p2 ++ vadd.i16 q5, \Q15, q14 @ q2 + q3 ++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 ++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 ++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 ++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 ++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) ++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) ++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) ++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) ++ vrshr.s16 q0, #3 @ scale, with rounding ++ vrshr.s16 q5, #3 ++ vrshr.s16 q1, #2 ++ vrshr.s16 q4, #2 ++ vrshr.s16 q2, #3 ++ vrshr.s16 q3, #3 ++ vsub.i16 q0, q9 @ find difference ++ vsub.i16 q5, q14 ++ vsub.i16 q1, q10 ++ vsub.i16 q4, q13 ++ vsub.i16 q2, \Q11 ++ vsub.i16 q3, q12 ++ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 ++ vmax.s16 q5, q6 ++ vmax.s16 q1, q6 ++ vmax.s16 q4, q6 ++ vmax.s16 q2, q6 ++ vmax.s16 q3, q6 ++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure ++ vdup.16 d13, r3 ++ vmin.s16 q0, q7 ++ vmin.s16 q5, q7 ++ vmin.s16 q1, q7 ++ vmin.s16 q4, q7 ++ vmin.s16 q2, q7 ++ vmin.s16 q3, q7 ++ vadd.i16 q0, q9 @ apply difference ++ vadd.i16 q5, q14 ++ vadd.i16 q1, q10 ++ vadd.i16 q4, q13 ++ vadd.i16 q2, \Q11 ++ vadd.i16 q3, q12 ++ vbit q9, q0, q6 @ apply filtered values according to mask ++ vbit q14, q5, q6 ++ vbit q10, q1, q6 ++ vbit q13, q4, q6 ++ vbit \Q11, q2, q6 ++ vbit q12, q3, q6 ++ vneg.s16 q6, q7 @ restore -tc2 ++ ++1: ++ bics r9, r7, r8 ++ beq 2f ++ ++ vsub.i16 q0, q12, \Q11 @ q0 - p0 ++ vsub.i16 q1, q13, q10 @ q1 - p1 ++ lsr r3, r9, #16 ++ vshl.i16 q2, q0, #3 ++ lsr r7, r5, #16 ++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) ++ lsr r8, r6, #16 ++ vshl.i16 q2, q1, #1 ++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) ++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 ++ vsub.i16 q5, q3, q4 ++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 ++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 ++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 ++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 ++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 ++ vmax.s16 q6, q5 @ ++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 ++ vdup.16 q0, r2 @ beta ++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] ++ vshr.s16 q4, #1 @ tc_2 = tc >> 1 ++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 ++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 ++ vshr.s16 q2, q0, #1 @ beta >> 1 ++ vadd.i16 q2, q0 @ beta + (beta >> 1) ++ vneg.s16 q0, q4 @ -tc_2 ++ vabs.s16 q5, q5 @ abs(original delta0) ++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 ++ vmax.s16 q1, q0 ++ vmax.s16 q3, q0 ++ vshl.s16 q0, q7, #2 @ 8 * tc ++ vadd.i16 q7, q0 @ 10 * tc ++ vdup.16 d0, r9 ++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering ++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, 
-tc_2, tc_2) ++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2) ++ vdup.16 d8, r5 @ dp0 + dp3 ++ vdup.16 d9, r7 @ dp0' + dp3' ++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) ++ vdup.16 d10, r6 @ dq0 + dq3 ++ vdup.16 d11, r8 @ dq0' + dq3' ++ vand q7, q0 @ AND block and line masks ++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) ++ vadd.i16 q0, q1, q10 @ p1 + deltap1 ++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1) ++ vadd.i16 q3, q3, q13 @ q1 + deltaq1 ++ vadd.i16 q1, \Q11, q6 @ p0 + delta0 ++ vsub.i16 q2, q12, q6 @ q0 - delta0 ++ vand q4, q7 @ AND nd_p test with block/line masks ++ vand q5, q7 @ AND nd_q test with block/line masks ++ vbit q10, q0, q4 ++ vbit \Q11, q1, q7 ++ vbit q12, q2, q7 ++ vbit q13, q3, q5 ++ ++2: ++.if \bit_depth == 8 ++ vmovn.i16 d16, q8 ++ vmovn.i16 d23, \Q15 ++ neg r1, r1 ++ vqmovun.s16 d17, q9 ++ vqmovun.s16 d18, q10 ++ vqmovun.s16 d19, \Q11 ++ lsls r10, #31 ++ vqmovun.s16 d20, q12 ++ vqmovun.s16 d21, q13 ++ vqmovun.s16 d22, q14 ++.else ++ vmov.i16 q0, #0 ++ vmov.i16 q1, #(1 << \bit_depth - 1) ++ @ q8 & q15 should be unaltered and so don't require clipping ++ neg r1, r1 ++ vmax.s16 q9, q0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ lsls r10, #31 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif ++ bx lr ++.endm ++ ++function hevc_loop_filter_luma_body ++ m_filter_luma 8, q15, q11 ++endfunc ++ ++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ ++ sub r4, r0, #4 ++ b .Lv_loop_luma_common ++endfunc ++ ++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
++@ uint8_t * pix_r, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int tc2, [r3] ++@ int no_f, [sp+0] ++@ uint8_t * pix_l) [sp+4] ++ ++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r4, [sp, #36] ++ ldr r10, [sp, #32] ++ ++.Lv_loop_luma_common: ++ vpush {d8-d15} ++ ++ @ It's slightly faster to do unlaned loads and transpose in the ++ @ 8-bit case, even though it needs more instructions, because ++ @ VLD4.8 is a really slow way to read from memory. ++ vld1.32 {d16[0]}, [r4:32], r1 ++ vld1.32 {d20[0]}, [r0:32], r1 ++ vld1.32 {d16[1]}, [r4:32], r1 ++ vld1.32 {d20[1]}, [r0:32], r1 ++ vld1.32 {d17[0]}, [r4:32], r1 ++ vld1.32 {d21[0]}, [r0:32], r1 ++ vld1.32 {d17[1]}, [r4:32], r1 ++ vld1.32 {d21[1]}, [r0:32], r1 ++ vld1.32 {d18[0]}, [r4:32], r1 ++ vld1.32 {d22[0]}, [r0:32], r1 ++ vld1.32 {d18[1]}, [r4:32], r1 ++ vld1.32 {d22[1]}, [r0:32], r1 ++ vld1.32 {d19[0]}, [r4:32], r1 ++ vld1.32 {d23[0]}, [r0:32], r1 ++ vld1.32 {d19[1]}, [r4:32] ++ vld1.32 {d23[1]}, [r0:32] ++ vuzp.16 q8, q9 ++ vuzp.16 q10, q11 ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vswp d17, d18 ++ vswp d21, d22 ++ ++ bl hevc_loop_filter_luma_body ++ ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ ++ @ no_p[1] ++ bmi 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 ++ ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] ++1: ++ @ no_q[1] ++ bcs 1f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 ++ ++ vst4.8 
{d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: ++ pop {r4-r10,pc} ++ ++.Lbypasswrite: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++endfunc ++ ++.macro m_filter_v_luma_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ ++ @ p[1] ++ bmi 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] ++1: ++ @ q[1] ++ bcs 1f ++ 
vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); sp[4] ++@ ++@ Src should always be on 8 byte boundry & all in the same slice ++ ++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ b .Lh_loop_filter_luma_common_8 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r10, [sp, #32] ++ ++.Lh_loop_filter_luma_common_8: ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 ++ vpush {d8-d15} ++ ++ vld1.8 {d16}, [r4], r1 ++ vld1.8 {d17}, [r0], r1 ++ vld1.8 {d18}, [r4], r1 ++ vld1.8 {d19}, [r0], r1 ++ vld1.8 {d20}, [r4], r1 ++ vld1.8 {d21}, [r0], r1 ++ vld1.8 {d22}, [r4] ++ vld1.8 {d23}, [r0] ++ ++ bl hevc_loop_filter_luma_body ++ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 ++ vpop {d8-d15} ++ ++ @ P2-P0 ++ bcs 1f ++ vst1.8 {d22}, [r4], r1 ++ vst1.8 {d21}, [r6] ++ vst1.8 {d20}, [r4] ++1: ++ @ Q0-Q2 ++ bmi 1f ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r2] ++ vst1.8 {d17}, [r0] ++1: ++ pop {r4-r10,pc} ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 ++ vpush {d8-d15} ++ ++ vld1.16 { q8}, [r4], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r4], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r4], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r4] ++ vld1.16 
{q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 ++ vpop {d8-d15} ++ ++ @ P2-P0 ++ bcs 1f ++ vst1.16 {q14}, [r4], r1 ++ vst1.16 {q13}, [r6] ++ vst1.16 {q12}, [r4] ++1: ++ bmi 1f ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r2] ++ vst1.16 { q9}, [r0] ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no_f ++@ 0 tl P0 ++@ 1 tr P1 ++@ 2 bl Q0 ++@ 3 br Q1 ++@ ++@ Probably not worth having the P/Qa only special case in this direction ++@ Given layout we won't save any memory reads or avoid any cache dirtying ++@ We would save a bit of computation but I expect the partials to be less ++@ common in the H direction than V due to how we arrange deblock. ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.8 {d26,d27}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.8 {d18,d19}, [r12], r1 ++ vld1.8 {d16,d17}, [r0], r1 ++ vld1.8 {d28,d29}, [r12] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ ++ "sub r12, r0, r1, asr #1" ++ ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ it pl ++ vstrpl d26, [r0, #0] ++ it cc ++ vstrcc d27, [r0, #8] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ it pl ++ vstrpl d18, [r12, #0] ++ it cc ++ vstrcc d19, [r12, #8] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.16 {q12, q13}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.16 {q10, q11}, [r12], r1 ++ vld1.16 {q8, q9 }, [r0], r1 ++ vld1.16 {q14, q15}, [r12] ++ ++ 
hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ ++ "sub r12, r0, r1, asr #1", \ ++ "cmp r3, #0" ++ ++ bne 1f ++ vst1.16 {q10, q11}, [r12] ++ vst1.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ itt pl ++ vstrpl d24, [r0, #0] ++ vstrpl d25, [r0, #8] ++ itt cc ++ vstrcc d26, [r0, #16] ++ vstrcc d27, [r0, #24] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ itt pl ++ vstrpl d20, [r12, #0] ++ vstrpl d21, [r12, #8] ++ itt cc ++ vstrcc d22, [r12, #16] ++ vstrcc d23, [r12, #24] ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f: ++@ 0 tl P0 ++@ 1 tr Q0 ++@ 2 bl P1 ++@ 3 br Q1 ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.16 {d16[0], d18[0]}, [r3], r1 ++ vld2.16 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.16 {d16[1], d18[1]}, [r3], r1 ++ vld2.16 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.16 {d16[2], d18[2]}, [r3], r1 ++ vld2.16 {d20[2], d22[2]}, [r0], r1 ++ ++ vld2.16 {d16[3], d18[3]}, [r3], r1 ++ vld2.16 {d20[3], d22[3]}, [r0], r1 ++ blo 10f ++ ++ vld2.16 {d17[0], d19[0]}, [r3], r1 ++ vld2.16 {d21[0], d23[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.16 {d17[1], d19[1]}, [r3], r1 ++ vld2.16 {d21[1], d23[1]}, [r0], r1 ++ ++ cmp ip, #4 ++ vld2.16 {d17[2], d19[2]}, [r3], r1 ++ vld2.16 {d21[2], d23[2]}, [r0], r1 ++ ++ vld2.16 {d17[3], d19[3]}, [r3] ++ vld2.16 {d21[3], d23[3]}, [r0] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #2", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth 
having this special case ++ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b ++ vst2.16 {d19[2], d21[2]}, [ip], r1 ++ vst2.16 {d19[1], d21[1]}, [r3], r1 ++ vst2.16 {d19[0], d21[0]}, [ip], r1 ++ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a ++ vst2.16 {d18[2], d20[2]}, [ip], r1 ++ vst2.16 {d18[1], d20[1]}, [r3] ++ vst2.16 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ @ Q0b ++ vst1.16 {d21[3]}, [r0], r1 ++ vst1.16 {d21[2]}, [r2], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.16 {d19[3]}, [r3], r1 ++ vst1.16 {d19[2]}, [ip], r1 ++ vst1.16 {d19[1]}, [r3], r1 ++ vst1.16 {d19[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.16 {d20[3]}, [r0], r1 ++ vst1.16 {d20[2]}, [r2], r1 ++ vst1.16 {d20[1]}, [r0] ++ vst1.16 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[3]}, [r3], r1 ++ vst1.16 {d18[2]}, [ip], r1 ++ vst1.16 {d18[1]}, [r3] ++ vst1.16 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #2", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[2]}, [r0] ++ vst1.16 {d20[3]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[0]}, [r3], r1 ++ vst1.16 {d18[1]}, [ip], r1 ++ vst1.16 {d18[2]}, [r3] ++ vst1.16 {d18[3]}, [ip] ++ pop 
{pc} ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++ ++@ no_f ++@ 0 tl P0a ++@ 1 tr Q0a ++@ 2 bl P0b ++@ 3 br Q0b ++ ++@ P1: q8, q12 ++@ P0: q9, q13 ++@ Q0: q10, q14 ++@ Q1: q11, q15 ++ ++.macro m_filter_v_uv2_16 bit_depth ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.32 {d16[0], d18[0]}, [r3], r1 ++ vld2.32 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.32 {d16[1], d18[1]}, [r3], r1 ++ vld2.32 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.32 {d17[0], d19[0]}, [r3], r1 ++ vld2.32 {d21[0], d23[0]}, [r0], r1 ++ ++ vld2.32 {d17[1], d19[1]}, [r3], r1 ++ vld2.32 {d21[1], d23[1]}, [r0], r1 ++ blo 10f ++ ++ vld2.32 {d24[0], d26[0]}, [r3], r1 ++ vld2.32 {d28[0], d30[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.32 {d24[1], d26[1]}, [r3], r1 ++ vld2.32 {d28[1], d30[1]}, [r0], r1 ++ ++ cmp ip, #8 ++ vld2.32 {d25[0], d27[0]}, [r3], r1 ++ vld2.32 {d29[0], d31[0]}, [r0], r1 ++ ++ vld2.32 {d25[1], d27[1]}, [r3] ++ vld2.32 {d29[1], d31[1]}, [r0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #4", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 8 and no_f == 0 ++@ so it is worth having this special case ++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b ++ vst2.32 {d27[0], d29[0]}, [ip], r1 ++ vst2.32 {d26[1], d28[1]}, [r3], r1 ++ vst2.32 {d26[0], d28[0]}, [ip], r1 ++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a ++ vst2.32 {d19[0], d21[0]}, [ip], r1 ++ vst2.32 {d18[1], d20[1]}, [r3] ++ vst2.32 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ 
@ Q0b ++ vst1.32 {d29[1]}, [r0], r1 ++ vst1.32 {d29[0]}, [r2], r1 ++ vst1.32 {d28[1]}, [r0], r1 ++ vst1.32 {d28[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.32 {d27[1]}, [r3], r1 ++ vst1.32 {d27[0]}, [ip], r1 ++ vst1.32 {d26[1]}, [r3], r1 ++ vst1.32 {d26[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.32 {d21[1]}, [r0], r1 ++ vst1.32 {d21[0]}, [r2], r1 ++ vst1.32 {d20[1]}, [r0] ++ vst1.32 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d19[1]}, [r3], r1 ++ vst1.32 {d19[0]}, [ip], r1 ++ vst1.32 {d18[1]}, [r3] ++ vst1.32 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #4", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.32 {d20[0]}, [r0], r1 ++ vst1.32 {d20[1]}, [r2], r1 ++ vst1.32 {d21[0]}, [r0] ++ vst1.32 {d21[1]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d18[0]}, [r3], r1 ++ vst1.32 {d18[1]}, [ip], r1 ++ vst1.32 {d19[0]}, [r3] ++ vst1.32 {d19[1]}, [ip] ++ pop {pc} ++.endm ++ ++ ++@ The NEON version is faster under ideal circumstances (i.e. 
everything in L1) ++@ But in real world testing it is ~20% slower, presumably due to code size ++ ++#if 0 // NEON version ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, int in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ mov ip, sp ++ push {a1-a3,v1-v8,lr} ++ ldm ip, {v1-v6} ++ cmp a1, #2 ++ bls 2f ++ vpush {d8-d13} ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++1: ++ vld2.32 {d0[0], d2[0]}, [a3]! ++ vld2.32 {d4[0], d6[0]}, [a4]! ++ vmov.u8 q12, #0 ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[0]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[0]}, [ip] ++ vld1.32 {d18[0]}, [v8] ++ vld1.32 {d22[0]}, [lr] ++ ++ vld2.32 {d0[1], d2[1]}, [a3]! ++ vld2.32 {d4[1], d6[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ vmov.u16 d12, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d13, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d27, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[2]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[1]}, [ip] ++ vld1.32 {d18[1]}, [v8] ++ vld1.32 {d22[1]}, [lr] ++ ++ vld2.32 {d1[0], d3[0]}, [a3]! ++ vld2.32 {d5[0], d7[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[4]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[0]}, [ip] ++ vld1.32 {d19[0]}, [v8] ++ vld1.32 {d23[0]}, [lr] ++ ++ vld2.32 {d1[1], d3[1]}, [a3]! 
++ vld2.32 {d5[1], d7[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[6]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[1]}, [ip] ++ vld1.32 {d19[1]}, [v8] ++ vld1.32 {d23[1]}, [lr] ++ ++ @ So now we have: ++ @ q0.32[i] = curr[i].mv[0] ++ @ q1.32[i] = curr[i].mv[1] ++ @ q2.32[i] = neigh[i].mv[0] ++ @ q3.32[i] = neigh[i].mv[1] ++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d24.16[i] = curr[i].pred_flag ++ @ d25.16[i] = neigh[i].pred_flag ++ ++ vtst.16 d28, d24, d12 ++ vtst.16 d29, d24, d13 ++ vadd.i16 d8, d24, d12 ++ vadd.i16 d9, d25, d12 ++ vtst.16 d30, d25, d12 ++ vtst.16 d31, d25, d13 ++ veor d26, d8, d9 ++ ldr lr, [sp, 6*8 + 1*4] ++ vmovl.s16 q4, d28 ++ vmovl.s16 q5, d29 ++ teq lr, #1 ++ vmovl.s16 q14, d30 ++ it ne ++ lslne v1, lr, #1 ++ vmovl.s16 q15, d31 ++ it ne ++ rsbne v2, v1, #32 ++ vbif q0, q1, q4 ++ vbif q2, q3, q14 ++ vbif q1, q0, q5 ++ vbif q3, q2, q15 ++ vabd.s16 q12, q0, q2 ++ vabd.s16 q2, q1 ++ vabd.s16 q0, q3 ++ vabd.s16 q1, q3 ++ vbif q8, q9, q4 ++ vbif q10, q11, q14 ++ vbif q9, q8, q5 ++ vbif q11, q10, q15 ++ vclt.u16 d6, d24, d27 ++ vclt.u16 d8, d2, d27 ++ vclt.u16 d7, d25, d27 ++ vclt.u16 d9, d3, d27 ++ vclt.u16 d2, d0, d27 ++ vclt.u16 d0, d4, d27 ++ vclt.u16 d3, d1, d27 ++ vclt.u16 d1, d5, d27 ++ vceq.i32 q12, q10, q8 ++ vceq.i32 q10, q9 ++ vceq.i32 q8, q11 ++ vceq.i32 q9, q11 ++ vshrn.i32 d6, q3, #8 ++ vshrn.i32 d7, q4, #8 ++ vshrn.i32 d8, q1, #8 ++ vshrn.i32 d9, q0, #8 ++ vmovn.i32 d4, q12 ++ vmovn.i32 d2, q10 ++ vmovn.i32 d3, q8 ++ vmovn.i32 d5, q9 ++ vand q2, q3 ++ vrev16.8 q3, q3 ++ vand q2, q3 ++ vand q1, q4 ++ vrev16.8 q4, q4 ++ vand q1, q4 ++ vand d4, d5 ++ vand d2, d3 ++ vbic d0, d12, d4 ++ 
vshr.u16 d26, #2 ++ vbic d0, d2 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d26 ++ bne 10f ++ ++ @ Merge results into result word, no duplicates ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, #30 ++ lsl v8, #30 ++ lsl ip, #30 ++ lsl lr, #30 ++ orr a2, ip, a2, lsr #2 ++ orr v8, lr, v8, lsr #2 ++ orr a2, v8, a2, lsr #4 ++ subs a1, #4 ++ orr v7, a2, v7, lsr #8 ++ bhi 1b ++ ++ mov a1, #32 ++ ldr a3, [sp, #6*8] ++ vpop {d8-d13} ++ sub a1, a1, a3, lsl #1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Merge results into result word, with duplicates ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, v2 ++ subs a1, #4 ++ lsl v8, v2 ++ lsl ip, v2 ++ lsl lr, v2 ++ ldr v2, [sp, #6*8 + 12*4 + 1*4] ++T lsr a2, v1 ++T orr a2, ip, a2 ++A orr a2, ip, a2, lsr v1 ++ lsl ip, v1, #1 ++T lsr v8, v1 ++T orr v8, lr, v8 ++A orr v8, lr, v8, lsr v1 ++ lsl lr, v1, #2 ++T lsr a2, ip ++T orr a2, v8, a2 ++A orr a2, v8, a2, lsr ip ++ ldr v1, [sp, #6*8 + 12*4] ++T lsr v7, lr ++T orr v7, a2, v7 ++A orr v7, a2, v7, lsr lr ++ bhi 1b ++ ++ mov a1, #32 ++ ldrd a3, a4, [sp, #6*8] ++ vpop {d8-d13} ++ mls a1, a3, a4, a1 ++ mls a1, a3, a4, a1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++ ++ ++2: ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++ vmov.u8 d16, #0 ++ blo 3f ++ vld2.32 {d0[0], d1[0]}, [a3]! ++ vld2.32 {d2[0], d3[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[0]}, [ip] ++ vld1.32 {d6[0]}, [v8] ++ vld1.32 {d7[0]}, [lr] ++ ++3: ++ vld2.32 {d0[1], d1[1]}, [a3]! ++ vld2.32 {d2[1], d3[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ vmov.u16 d17, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d18, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d19, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[1]}, [ip] ++ vld1.32 {d6[1]}, [v8] ++ vld1.32 {d7[1]}, [lr] ++ ++ @ So now we have: ++ @ d0.32[i] = curr[i].mv[0] ++ @ d1.32[i] = curr[i].mv[1] ++ @ d2.32[i] = neigh[i].mv[0] ++ @ d3.32[i] = neigh[i].mv[1] ++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d16.16[i] = curr[i].pred_flag ++ @ d16.16[2+i] = neigh[i].pred_flag ++ ++ vtst.16 d20, d16, d17 ++ vtst.16 d22, d16, d18 ++ vadd.i16 d30, d16, d17 ++ vswp d2, d3 ++ ldr lr, [sp, #1*4] ++ vmovl.s16 q10, d20 ++ teq lr, #1 ++ vmovl.s16 q11, d22 ++ it ne ++ lslne v1, lr, #1 ++ vbif d0, d1, d20 ++ vbif d4, d6, d20 ++ vbif d3, d2, d21 ++ vbif d5, d7, d21 ++ vbif d1, d0, d22 ++ vbif d6, d4, d22 ++ vbif d2, d3, d23 ++ vbif d7, d5, d23 ++ vshr.u16 d30, #2 ++ vabd.s16 d24, d0, d3 ++ vabd.s16 d25, d1, d2 ++ vabd.s16 q0, q0, q1 ++ vceq.i32 d2, d4, d5 ++ vceq.i32 d20, d5, d6 ++ vceq.i32 d21, d4, d7 ++ vceq.i32 d3, d6, d7 ++ vclt.u16 d6, d24, d19 ++ vclt.u16 d7, d25, d19 ++ vclt.u16 d22, d1, d19 ++ vclt.u16 d23, d0, d19 ++ vshrn.i32 d6, q3, #8 ++ vmovn.i32 d2, q1 ++ vshrn.i32 d7, q11, #8 ++ vmovn.i32 d3, q10 ++ vand q0, q3, q1 ++ it ne ++ rsbne v2, v1, #32 ++ vrev16.8 q3, q3 ++ vand q0, q3 ++ vsra.u64 d30, #32 ++ vshr.u64 q1, q0, #32 ++ vand q0, q1 ++ vbic d0, d17, d0 ++ vand d30, d30, d17 ++ vbic d0, d1 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d30 ++ bne 10f ++ ++ @ Construct result word, no duplicates ++ cmp a1, #2 ++ vmov.u16 a1, d0[1] ++ vmov.u16 a2, d0[0] ++ it eq ++ orreq a1, a2, a1, lsl #2 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Construct result word, 
with duplicates ++ cmp a1, #2 ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov.u16 a1, d0[1] ++ lsl a2, #16 ++ pkhbt a1, a1, a1, lsl #16 ++ lsr a2, v2 ++ lsr a1, v2 ++T itt eq ++T lsleq a1, v1 ++T orreq a1, a2, a1 ++A orreq a1, a2, a1, lsl v1 ++ pop {a2-a4,v1-v8,pc} ++endfunc ++ ++ ++ ++#else // non-NEON version ++ ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ add ip, sp, #4*4 ++ push {a2-a4,v1-v8,lr} ++ mov v6, #32 ++1: ldmdb ip, {v1-v4} ++ ldrsb v5, [a3, #8] @ curr->ref_idx ++ ldrsb v8, [a3, #9] ++ ldrsb ip, [a4, #8] @ neigh->ref_idx ++ ldrsb lr, [a4, #9] ++ ldr v1, [v1, v5, lsl #2] ++ ldrb v5, [a3, #10] @ curr->pred_flag ++ ldr v2, [v2, v8, lsl #2] ++ ldrb v8, [a4, #10] @ neigh->pred_flag ++ ldr v3, [v3, ip, lsl #2] ++ ldr v4, [v4, lr, lsl #2] ++ teq v5, #3 ++ beq 20f ++ teq v8, #3 ++ beq 90f ++ ++ tst v5, #1 ++ itee ne ++ ldrne v5, [a3, #0] @ curr->mv[0] ++ moveq v1, v2 ++ ldreq v5, [a3, #4] @ curr->mv[1] ++ tst v8, #1 ++ itee ne ++ ldrne v8, [a4, #0] @ neigh->mv[0] ++ moveq v3, v4 ++ ldreq v8, [a4, #4] @ neigh->mv[1] ++ teq v1, v3 ++ bne 10f ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v8, v5 ++ ssub16 v5, v5, v8 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ @ drop through ++10: it ne ++ movne v5, #1<<30 ++11: ++ sub v6, v6, #2 ++T mov v7, v7, lsr #2 ++ subs a2, a2, #1 ++A orr v7, v5, v7, lsr #2 ++T orr v7, v5, v7 ++ bhi 11b ++ ++ ldrd v3, v4, [sp, #16*4] ++ ldr a2, [sp] ++ add ip, sp, #16*4 ++ subs a1, a1, #1 ++ add a3, a3, v3 ++ add a4, a4, v4 ++ bhi 1b ++ mov a1, v7, lsr v6 ++ pop {a2-a4,v1-v8,pc} ++ ++20: teq v8, #3 ++ bne 10b ++ ++ teq v1, v3 ++ it eq ++ teqeq v2, v4 ++ bne 40f ++ teq v1, v2 ++ bne 30f ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, 
=0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 25f ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ beq 11b ++ @ drop through ++25: ssub16 ip, v4, v1 ++ ssub16 v5, v1, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v3, v2 ++ ssub16 v5, v2, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++30: ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++40: teq v1, v4 ++ ite eq ++ teqeq v2, v3 ++ bne 10b ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ b 25b ++ ++90: ++ mov v5, #1<<30 ++ b 11b ++endfunc ++ ++ ++#endif ++ ++ ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10, q11, q15 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ b .Lh_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r10, [sp, #32] ++.Lh_loop_luma_common_10: ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ sub r4, r0, #8 ++ b .Lv_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r4, [sp, #36] ++ ldr r10, [sp, #32] ++ ++.Lv_loop_luma_common_10: ++ m_filter_v_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 ++ m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ 
+diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +new file mode 100644 +index 0000000000..7ed5c7dc52 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +@@ -0,0 +1,184 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++/* uses registers q8 - q13 for temp values */ ++.macro tr4_luma_shift shift ++ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 ++ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 ++ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 ++ vaddl.s16 q11, d28, d31 // src0 + src3 ++ ++ vmul.i32 q12, q8, d1[0] // 29 * c0 ++ vmul.i32 q13, q10, d2[0] // 55 * c2 ++ vmul.i32 q8, q8, d2[0] // 55 * c0 ++ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 ++ ++ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 ++ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 ++ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 ++ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 ++ ++ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) ++ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 ++ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 
++ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 ++ ++ vqrshrn.s32 d28, q12, \shift ++ vqrshrn.s32 d29, q13, \shift ++ vqrshrn.s32 d30, q11, \shift ++ vqrshrn.s32 d31, q8, \shift ++.endm ++ ++/* uses registers q8 - q11 for temp values */ ++.macro tr4_shift shift ++ vmull.s16 q9, d29, d0[0] // 83 * src1 ++ vmull.s16 q8, d29, d0[1] // 36 * src1 ++ vshll.s16 q14, d28, #6 // 64 * src0 ++ vshll.s16 q10, d30, #6 // 64 * src2 ++ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 ++ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 ++ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 ++ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 ++ vadd.s32 q14, q11, q9 // e0 + o0 ++ vadd.s32 q15, q10, q8 // e1 + o1 ++ vsub.s32 q8, q10, q8 // e1 - o1 ++ vsub.s32 q9, q11, q9 // e0 - o0 ++ ++ vqrshrn.s32 d28, q14, \shift ++ vqrshrn.s32 d29, q15, \shift ++ vqrshrn.s32 d30, q8, \shift ++ vqrshrn.s32 d31, q9, \shift ++.endm ++ ++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ ++ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ ++ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ ++ shift, I1, I2, I3 ++ ++ vmull.s16 q4, \d1, d1[1] // 89 * src1 ++ \I1 ++ vmull.s16 q5, \d1, d1[0] // 75 * src1 ++ \I2 ++ vmull.s16 q6, \d1, d1[3] // 50 * src1 ++ \I3 ++ vmull.s16 q7, \d1, d1[2] // 18 * src1 ++ vmlal.s16 q4, \d3, d1[0] // 75 * src3 ++ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 ++ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 ++ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 ++ ++ // tr4 ++ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) ++ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) ++ ++ vmlal.s16 q4, \d5, d1[3] // 50 * src5 ++ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 ++ vmlal.s16 q6, \d5, d1[2] // 18 * src5 ++ vmlal.s16 q7, \d5, d1[0] // 75 * src5 ++ ++ vshll.s16 q3, \d0, #6 // 64 * src(0*2) ++ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) ++ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 ++ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 ++ vadd.i32 \tmp1, q3, \tmp0 // 64 * 
(src(0*2) + src(2*2)) e0 ++ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 ++ ++ vmlal.s16 q4, \d7, d1[2] // 18 * src7 ++ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 ++ vmlal.s16 q6, \d7, d1[0] // 75 * src7 ++ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 ++ ++ vsub.i32 q3, \tmp1, q1 // e0 - o0 ++ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 ++ vadd.i32 q1, \tmp0, q2 // e1 + o1 ++ vsub.i32 q2, \tmp0, q2 // e1 - o1 ++ ++ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] ++ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] ++ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] ++ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] ++ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] ++ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] ++ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] ++ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] ++ vqrshrn.s32 \d0, \tmp0, #\shift ++ vqrshrn.s32 \d4, \tmp1, #\shift ++ vqrshrn.s32 \d1, q3, #\shift ++ vqrshrn.s32 \d5, q1, #\shift ++ vqrshrn.s32 \d2, q6, #\shift ++ vqrshrn.s32 \d6, q5, #\shift ++ vqrshrn.s32 \d3, q7, #\shift ++ vqrshrn.s32 \d7, q4, #\shift ++.endm ++ ++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 ++ vld1.16 {\d0}, [r0 :64], r3 ++ vld1.16 {\d1}, [r2 :64], r3 ++ vld1.16 {\d2}, [r0 :64], r3 ++ vld1.16 {\d3}, [r2 :64], r3 ++ vld1.16 {\d4}, [r0 :64], r3 ++ vld1.16 {\d5}, [r2 :64], r3 ++ vld1.16 {\d6}, [r0 :64], r3 ++ vld1.16 {\d7}, [r2 :64], r3 ++ ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, 7, "\I1", "\I2", "\I3" ++.endm ++ ++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, \shift ++ ++ vzip.16 \d0, \d4 ++ vzip.16 \d1, \d5 ++ vzip.16 \d2, \d6 ++ vzip.16 \d3, \d7 ++ vst4.16 {\d0-\d3}, [r0 :128], r3 ++ vst4.16 {\d4-\d7}, [r2 :128], r3 ++.endm ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevc_idct_fn_neon.S" ++ ++.text ++ ++.align 4 ++tr4f: ++.word 0x00240053 // 36 and d0[0] = 83 ++.word 0x00000000 
++tr8f: ++.word 0x0059004b // 89, d1[0] = 75 ++.word 0x00320012 // 50, d1[2] = 18 ++tr16: ++.word 0x005a0057 // 90, d2[0] = 87 ++.word 0x00500046 // 80, d2[2] = 70 ++.word 0x0039002b // 57, d2[0] = 43 ++.word 0x00190009 // 25, d2[2] = 9 ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "rpi_hevc_idct_fn_neon.S" ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c +new file mode 100644 +index 0000000000..109fa98c29 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++ ++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevcdsp_rpi_init_neon(c, bit_depth); ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c +new file mode 100644 +index 0000000000..9294ab8010 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c +@@ -0,0 +1,467 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/bit_depth_template.c" ++ ++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but ++// have been removed from head as we never use them. ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, 
unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); ++ ++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t 
*_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t 
*_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void 
ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int 
sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void 
ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++ ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1); ++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); ++ ++ ++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) /* 48-wide edge SAO composed from the 32-wide and 16-wide NEON routines; the width argument is not read */ ++{ ++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); /* second call starts 32 bytes into each row */ ++} ++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) /* 10-bit counterpart of the 48-wide edge wrapper */ ++{ ++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++
ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); /* NOTE(review): byte offset is 64 here versus 32 in the 8-bit wrapper, presumably two bytes per sample - confirm */ ++} ++ ++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) /* same 32 + 16 split as the edge wrapper, applied to band SAO; width is not read */ ++{ ++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) /* 10-bit counterpart; second call is offset by 64 bytes */ ++{ ++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) /* 24-wide edge SAO: the 16-wide routine followed by the 8-wide routine; width is not read */ ++{ ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) /* 10-bit counterpart; second call is offset by 32 bytes */ ++{ ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) /* 24-wide band SAO built from the 16-wide and 8-wide routines */ ++{ ++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src,
sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) /* 10-bit counterpart; second call is offset by 32 bytes */ ++{ ++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) /* chroma 24-wide edge SAO from the c_16 and c_8 routines; NOTE(review): the 32-byte offset after a 16-wide call presumably reflects interleaved U/V samples - confirm */ ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) /* 10-bit counterpart; second call is offset by 64 bytes */ ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) /* chroma 24-wide band SAO from the c_16 and c_8 routines; width is not read */ ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++
ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) /* 10-bit counterpart; second call is offset by 64 bytes */ ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++#endif ++ ++ ++ ++#if RPI_HEVC_SAO_BUF_STRIDE != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif ++ ++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) /* fills c with NEON routines for bit_depth 8 or 10; any other depth receives only the two assignments made after the if/else */ ++{ ++ if (bit_depth == 8) { ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; /* the _c slot is given the same routine as the plain slot */ ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; ++ c->add_residual[1] =
ff_hevc_rpi_add_residual_8x8_neon_8; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; /* static C wrapper from this file, not a direct assembly entry */ ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; ++
c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; /* entries 3 and 4 of the _c tables are not assigned anywhere in this function */ ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; ++#endif ++ } ++ else if (bit_depth == 10) { /* same table layout as the 8-bit branch, using the _10 routines */ ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; ++
c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] =
ff_hevc_rpi_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; ++#endif ++ } ++ ++ assert(offsetof(HEVCRpiMvField, mv) == 0); /* NOTE(review): the three asserts pin the HEVCRpiMvField layout, presumably because the NEON boundary-strength routine hard-codes these offsets - confirm */ ++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); ++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); ++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; /* assigned for every bit_depth value */ ++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..93876d14c0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +@@ -0,0 +1,620 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN /* macro clamps four vectors of signed 16-bit lanes: lower bound Q_MIN first, then upper bound Q_MAX */ ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 /* ip = r0 plus one stride, i.e. the second row */ ++ vld1.16 {q10, q11}, [r1] ++ lsl r2, #1 /* stride doubled: r0 and ip each advance two rows per post-increment */ ++ vld1.16 {d0}, [r0 :64], r2 ++ vld1.16 {d1}, [ip :64], r2 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r2 /* rewind r0 to the first row for the stores */ ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ vmov.i16 q8, #0 /* clamp floor */ ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* clamp ceiling: the low BIT_DEPTH bits all set */ ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r2 ++ vst1.16 {d1}, [ip :64], r2 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 /* splat the low 16 bits of dc into every lane of q15 */ ++ lsl r1, #1 ++ vld1.16 {d0}, [r0 :64], r1 ++ vld1.16 {d1}, [ip :64], r1 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r1 ++
vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r1 ++ vst1.16 {d1}, [ip :64], r1 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 /* row counter */ ++ vmov.i64 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0 :128], r2 ++ vld1.16 {q1}, [ip :128], r2 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ subs r3, #4 /* four rows are handled per loop pass */ ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r2 ++ vst1.16 {q1}, [ip :128], r2 ++ vst1.16 {q2}, [r0 :128], r2 ++ vst1.16 {q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #4 ++ vdup.32 q15, r2 /* 32-bit splat: adjacent 16-bit lanes take the low and high halves of r2 */ ++ b 9f /* continues at label 9 inside the function that follows */ ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ vld1.16 {q0}, [r0 :128], r1 ++ vld1.16 {q1}, [ip :128], r1 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r1 ++ vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ subs r3, #4 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r1 ++ vst1.16 {q1}, [ip :128], r1 ++ vst1.16 {q2}, [r0 :128], r1 ++
vst1.16 {q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 ++ vmov.i16 q8, #0 ++ lsl r2, #1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov r3, #16 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 /* two rows per pass: one through r0, one through ip */ ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r2 ++ vst1.16 {q2, q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 ++ vdup.32 q15, r2 ++ b 9f /* continues at label 9 inside the function that follows */ ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r3, #16 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation.
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r1 ++ vst1.16 {q2, q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ push {lr} /* lr is reused below as the pldw prefetch address */ ++ mov r3, #32 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 /* ip addresses the second 32 bytes of the row */ ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q11 ++ add lr, r2 ++ vqadd.s16 q2, q12 ++ subs r3, #1 /* one row per pass */ ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r2 ++ vst1.16 {q2-q3}, [ip], r2 ++ bne 1b ++ pop {pc} ++ ++endfunc ++ ++@ add_residual16x16_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #16 ++ vdup.32 q15, r2 ++ b 9f /* continues at label 9 inside the function that follows */ ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++9: ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q15 ++ subs r3, #1 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r1 ++ vst1.16 {q2-q3}, [ip], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_,
BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 /* de-interleaving load: even elements into d0, odd elements into d2 */ ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q10 /* even elements receive the residual */ ++ vqadd.s16 q1, q15 /* odd elements receive the splatted dc argument */ ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 /* r3 is reused as the row counter once dc has been splatted */ ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]!
++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q15 ++ add lr, r2 ++ vqadd.s16 q2, q11 ++ subs r3, #1 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q15 /* mirror of the U variant: even elements get dc, odd elements get the residual */ ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]!
++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q15 ++ pldw [lr] ++ vqadd.s16 q1, q10 ++ add lr, r2 ++ vqadd.s16 q2, q15 ++ subs r3, #1 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++ vldm r1, {q10-q13} ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ add ip, r0, r2 ++ lsl r2, #1 ++ 
vmov.i16 q8, #0 ++ add r3, r1, #(8*8*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov lr, #8 ++1: ++ vld1.16 {q10, q11}, [r1 :256]! ++ subs lr, #2 ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ push {r4, lr} ++ vmov.i16 q8, #0 ++ add r3, r1, #(16*16*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++ add r4, r0, r2 ++ mov lr, #16 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ pldw [r4] ++ vqadd.s16 q1, q12 ++ add r4, r2 ++ vqadd.s16 q2, q11 ++ subs lr, #1 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {r4,pc} ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +new file mode 100644 +index 0000000000..d9a1d7d98c +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +@@ -0,0 +1,741 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++@ General notes: ++@ ++@ Residual is generally only guaranteed to be clipped to 16 bits. ++@ This means that we do need to do vmovl, vqadd, vqmovun ++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away ++@ with this). ++@ ++@ There is an exception for the DC case because its transform is guaranteed ++@ to be small enough that overflow cannot occur during the first add. 
++ ++@ ============================================================================ ++@ Y add ++ ++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.32 d4[0], [r0], r2 ++ rsb r3, r2, #0 ++ vld1.32 d4[1], [ip], r2 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vqadd.s16 q0, q8 ++ vqadd.s16 q1, q9 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r2 ++ vst1.32 d0[1], [ip], r2 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 ++ push {r4, lr} ++ vld1.16 {q0, q1}, [r1]! ++ add ip, r0, r2 ++ vld1.8 {d6}, [r0] ++ add r4, r0, r2, lsl #1 ++ vld1.8 {d7}, [ip] ++ add lr, ip, r2, lsl #1 ++ lsl r2, #1 ++ mov r3, #8-2 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! ++ subs r3, #2 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {d6}, [r4], r2 ++ vld1.8 {d7}, [lr], r2 ++ vst1.8 {d4}, [r0], r2 ++ vst1.8 {d5}, [ip], r2 ++ vmovl.u8 q2, d6 ++ pldw [r4] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 ++ vld1.16 {q0, q1}, [r1]! ++ add ip, r0, r2 ++ vld1.8 {q3}, [r0] ++ mov r3, #16-1 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! 
++ subs r3, #1 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {q3}, [ip], r2 ++ vst1.8 {q2}, [r0], r2 ++ vmovl.u8 q2, d6 ++ pldw [ip] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 ++ vldm r1!, {q0-q3} ++ vld1.8 {q8, q9}, [r0] ++ add ip, r0, r2 ++ vmovl.u8 q10, d16 ++ mov r3, #32-1 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++1: ++ vldm r1!, {q0-q3} ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vld1.8 {q8, q9}, [ip], r2 ++ subs r3, #1 ++ vst1.8 {q10, q11}, [r0], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 ++ lsl r1, #1 ++ vld1.32 d4[0], [r0], r1 ++ rsb r3, r1, #0 ++ vld1.32 d4[1], [ip], r1 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [ip], r1 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ DC Y or C add ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function 
ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 ++ mov r3, #4-2 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8-2 ++1: vld1.8 d16, [r0] ++ add ip, r0, r1 ++ push {r4, lr} ++ vld1.8 d17, [ip] ++ add r4, r0, r1, lsl #1 ++ vaddw.u8 q0, q15, d16 ++ lsl r1, #1 ++ vaddw.u8 q1, q15, d17 ++ add lr, ip, r1 ++1: ++ vld1.8 {d16}, [r4], r1 ++ vld1.8 {d17}, [lr], r1 ++ subs r3, #2 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {d4}, [r0], r1 ++ vst1.8 {d5}, [ip], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 ++ mov r3, #8-1 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16-1 ++1: vld1.8 {q8}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++1: ++ vld1.8 {q8}, [ip], r1 ++ subs r3, #1 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {q2}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 ++ mov r3, #16-1 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( ++@ 
uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32-1 ++1: vld1.8 {q8, q9}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vld1.8 {q8, q9}, [ip], r1 ++ subs r3, #1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vst1.8 {q10, q11}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128]
++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ add r4, r0, r2 ++ vmovl.u8 q10, d16 ++ add lr, ip, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d16 ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vld1.16 {q0, q1}, [r1 :256]!
++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d16 ++ pldw [ip] ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc) [r3] - r3 is read by the vdup.16 below (presumably dc_u) ++ ++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc) [r3] - r3 is read by the vdup.16 below (presumably dc_u) ++ ++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128] ++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]!
++ add r4, r0, r2 ++ vmovl.u8 q10, d17 ++ add lr, ip, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++1: ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q10, d17 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++ bne 1b ++ ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc) [r3] - r3 is read by the vdup.16 below (presumably dc_u) ++ ++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d18 ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vld1.16 {q0, q1}, [r1 :256]!
++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d18 ++ pldw [ip] ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++ bne 1b ++ ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1]! @ all of U ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ rsb r3, r2, #0 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.16 {q2, q3}, [r1] @ all of V ++ vld1.8 {d18}, [r0 :64], r3 ++ vld1.8 {d19}, [ip :64], r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 ++ vld2.8 {d16, d17}, [r0 :128] ++ add r3, r1, #(8*8*2) @ Offset to V ++ vld1.16 {q0}, [r1 :128]! ++ add ip, r0, r2 ++ vld1.16 {q1}, [r3 :128]! ++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #8-1 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++1: ++ vld2.8 {d16, d17}, [ip :128], r2 ++ subs lr, #1 ++ vld1.16 {q0}, [r1 :128]! ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vld1.16 {q1}, [r3 :128]! 
++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 ++ vld2.8 {q8, q9}, [r0 :256] ++ add r3, r1, #(16*16*2) @ Offset to V ++ vld1.16 {q0, q1}, [r1 :256]! ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r3 :256]! ++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #16-1 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs lr, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vst2.8 {d20-d23}, [r0 :256], r2 ++ vld1.16 {q2, q3}, [r3 :256]! ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20-d23}, [r0 :256] ++ pop {pc} ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ +diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +new file mode 100644 +index 0000000000..b56e0f9644 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +@@ -0,0 +1,2245 @@ ++/* ++ * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.set EDGE_SRC_STRIDE, 160 ++ ++@ PIC jump tables are fractionally more expensive than absolute in our code ++.set jent_pic, CONFIG_PIC ++ ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 ++ vshr.u8 q12, q8, #3 ++ \I1 ++ vadd.i8 q8, \Q_K128 ++ \I2 ++ vshr.u8 q13, q9, #3 ++ \I3 ++ vadd.i8 q9, \Q_K128 ++ \I4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ ++ vqadd.s8 q8, q12 ++ vshr.u8 q12, q10, #3 ++ vadd.i8 q10, \Q_K128 ++ vqadd.s8 q9, q13 ++ vshr.u8 q13, q11, #3 ++ vadd.i8 q11, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vsub.i8 q8, \Q_K128 ++ vqadd.s8 q11, q13 ++ vsub.i8 q9, \Q_K128 ++ vsub.i8 q10, \Q_K128 ++ vsub.i8 q11, \Q_K128 ++.endm ++ ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vsub.i8 q13, q12, \Q_K128 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, 
#3 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bpl 1b ++2: vsub.i8 q13, q12, \Q_K128 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ \I1 ++ vtbl.8 d24, \XLAT0, d24 ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d25, \XLAT1, d25 ++ \I2 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q10, q11, q12 ++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bpl 1b ++2: vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by 
log2_sao_offset_scale_luma/chroma
++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
++@ precision
++
++@ This, somewhat nasty, bit of code builds the {d0-d3} translation
++@ array via the stack
++@ Given that sao_left_class > 28 can cause wrap we can't just poke
++@ all 4 bytes in at once
++@
++@ It also loads other common regs: ip = height - 1, r4 = r1 + r3
++
++@ Beware that the offset read here over-reads by 6 bytes so source must be sized appropriately
++function band_load_y
++        ldr             ip, [sp, #16]           @ &sao_offset_val[0]
++        ldr             r4, [sp, #20]           @ sao_left_class
++        vmov.i64        d4, #0                  @ d0-d4 (with q0, q1 below) = 40 zero bytes
++        vmov.i64        q0, #0
++        pld             [r1]
++        vld2.8          {q8}, [ip]              @ De-interleave bytes: d16 = even bytes (low byte of each int16, assuming little-endian)
++        sub             ip, sp, #8*5            @ ip = base of the 5 x 8 byte array pushed below
++        vmov.i64        q1, #0
++        add             r4, ip, r4              @ r4 = array base + sao_left_class
++        vpush           {d0-d4}                 @ Put zero array on stack
++        vshr.u64        d16, d16, #8            @ 1st interesting val is [1]
++        ldr             ip, [ip, #8*5 + 28]     @ height
++        vst1.32         {d16[0]}, [r4]          @ Poke the 4 offset bytes into the zero array
++        add             r4, r1, r3              @ r4 = r1 + r3 (the 64-wide caller uses it with pld)
++        vpop            {d0-d4}                 @ Pop modified array
++        sub             ip, ip, #1              @ ip = height - 1
++        vorr            d0, d0, d4              @ Bytes that landed at index 32+ (in d4) wrap to the start
++        bx              lr
++endfunc
++
++@ Beware that offset reads here over-read by 6 bytes so source must be sized appropriately
++function band_load_c
++        ldr             ip, [sp, #16]           @ &sao_offset_val1[0]
++        ldr             r4, [sp, #20]           @ sao_left_class1
++        vmov.i64        d24, #0                 @ d20-d24 (with q10, q11 below) = 40 zero bytes
++        vmov.i64        q10, #0
++        pld             [r1]
++        vld2.8          {q8}, [ip]              @ De-interleave bytes of table 1: d16 = even bytes
++        sub             ip, sp, #8*5            @ ip = base of the 5 x 8 byte array pushed below
++        vmov.i64        q11, #0
++        add             r4, ip, r4              @ r4 = array base + sao_left_class1
++        ldr             ip, [sp, #24]           @ &sao_offset_val2[0]
++        vpush           {d20-d24}               @ Put zero array on stack
++        vld2.8          {q9}, [ip]              @ De-interleave bytes of table 2: d18 = even bytes
++        vshr.u64        d16, d16, #8            @ 1st interesting val is [1]
++        ldr             ip, [sp, #8*5 + 28]     @ sao_left_class2
++        vst1.32         {d16[0]}, [r4]          @ Poke the 4 table 1 offset bytes into the zero array
++        add             ip, sp, ip              @ ip = array base + sao_left_class2
++        vshr.u64        d18, d18, #8            @ 1st interesting val is [1]
++        vldmia          sp, {d0-d3}             @ Load modified array
++        vldr            d16, [sp, #8*4]         @ d16 = array bytes 32-39 (wrap overflow for table 1)
++        add             r4, r1, r3              @ r4 = r1 + r3
++        vstmia          sp, {d20-d24}           @ Put zero array on stack (again)
++        vst1.32         {d18[0]}, [ip]          @ Poke the 4 table 2 offset bytes into the zero array
++        vorr            d0, d0, d16             @ Table 1: fold wrapped bytes back to the start
++        vldmia          sp, {d4-d7}             @ Load modified array
++        vldr            d18, [sp, #8*4]         @ d18 = array bytes 32-39 (wrap overflow for table 2)
++        ldr             ip, [sp, #8*5 + 36]     @ height
++        add             sp, sp, #8*5            @ Release the stack array
++        vorr            d4, d4, d18             @ Table 2: fold wrapped bytes back to the start
++        sub             ip, ip, #1              @ ip = height - 1
++        bx              lr
++endfunc
++
++
++@ 
ff_hevc_rpi_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_64_neon_8, export=1 ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vldmia r1, {q8-q11} ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ ++ "pld [r4]", \ ++ "subs ip, #1", \ ++ "it ne; addne r4, r3", \ ++ "add r1, r3" ++ vstmia r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld1.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8}, [r1, :128], r3 ++ subs ip, #4 ++ vld1.8 { q9}, [r6, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 
{ q9}, [r5, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_8_neon_8, export=1 ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "", \ ++ "", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.32 {d16[0]}, [r1, :32], r3", \ ++ "subs ip, #4", \ ++ "vld1.32 {d16[1]}, [r6, :32], r3", \ ++ "vld1.32 {d17[0]}, [r1, :32], r3", \ ++ "vld1.32 {d17[1]}, [r6, :32], r3", \ ++ "vst1.32 {d26[0]}, [r0, :32], r2", \ ++ "vst1.32 {d26[1]}, [r5, :32], r2", \ ++ "vst1.32 {d27[0]}, [r0, :32], r2", \ ++ "vst1.32 {d27[1]}, [r5, :32], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, 
:128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld2.8 {d16-d17}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.8 {d26-d27}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "vuzp.8 d16, d17", \ ++ "", \ ++ "vzip.8 d26, d27", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t 
*sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ vpush {q4-q7} ++ ++1: vldm r1, {q4-q11} ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vldm r1, {q8-q11} ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vld1.16 { q8, q9 }, [r1, :128], r3 ++ subs r12, #2 ++ vld1.16 {q10, q11}, [r6, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth ++ vst1.16 { q8, q9 
}, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {q8}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld1.16 {q9}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst1.16 {q10}, [r0, :128], r2", \ ++ "vst1.16 {q11}, [r5, :128], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {d16}, [r1, :64], r3", \ ++ "subs ip, #4", \ ++ "vld1.16 {d17}, [r6, :64], r3", \ ++ "vld1.16 {d18}, [r1, :64], r3", \ ++ "vld1.16 {d19}, [r6, :64], r3", \ ++ "vst1.16 {d20}, [r0, :64], r2", \ ++ "vst1.16 {d21}, [r5, :64], r2", \ ++ "vst1.16 {d22}, [r0, :64], r2", \ ++ "vst1.16 {d23}, [r5, :64], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ sub r2, #64 ++ sub r3, #64 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ mov lr, #64 ++ 
vpush           {q4-q7}
++
++1:      vld2.16         { q4, q5 }, [r1, :128], lr
++        subs            ip, #1
++        vld2.16         { q6, q7 }, [r6, :128], lr
++        vld2.16         { q8, q9 }, [r1, :128], r3
++        vld2.16         {q10, q11}, [r6, :128], r3
++
++        sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++            "pld [r4]", \
++            "it ne; addne r4, r3"
++        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++        vst2.16         { q4, q5 }, [r0, :128], lr
++        vst2.16         { q6, q7 }, [r5, :128], lr
++        vst2.16         { q8, q9 }, [r0, :128], r2
++        vst2.16         {q10, q11}, [r5, :128], r2
++
++        bpl             1b
++
++        vpop            {q4-q7}
++        pop             {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
++        band_c_32_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_16_neon_10(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++
++.macro band_c_16_16 bit_depth
++        push            {r4-r6, lr}
++        add             r5, r0, #32             @ r5 / r6 = dst / src + 32 bytes (second half of the row)
++        add             r6, r1, #32
++        vmov.i64        q14, #0                 @ q14 = lower clip bound (0)
++        vmov.i16        q15, #(1 << \bit_depth) - 1     @ q15 = upper clip bound
++        bl              band_load_c             @ xlat tables -> {d0-d3} / {d4-d7}, ip = height - 1
++
++1:      vld2.16         { q8, q9 }, [r1, :128], r3
++        subs            ip, #1                  @ One row per iteration; flags consumed by bpl below
++        vld2.16         {q10, q11}, [r6, :128], r3
++
++        @ Only q8-q11 are loaded and stored here so only they are processed; q4-q7 (callee-saved d8-d15) stay untouched
++        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++        vst2.16         { q8, q9 }, [r0, :128], r2
++        vst2.16         {q10, q11}, [r5, :128], r2
++
++        bpl             1b
++        pop             {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
++        band_c_16_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_8_neon_10(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++
++.macro band_c_8_16 bit_depth
++        ldr             ip, [sp, #16]           @ width
++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {q8,q9}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.16 {q10,q11}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {d16,d18}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld2.16 {d17,d19}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst2.16 {d20,d22}, [r0, :128], r2", \ ++ "vst2.16 {d21,d23}, [r5, :128], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_8 ++ ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 ++ ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 ++ ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vtbl.8 d7, {d27}, d7 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" 
separately via d26/d27. For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmov.u8 q12, #2 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vldr d26, [r5] ++ ++ vuzp.8 q0, q1 ++ ++ vldr d27, [r5, #8] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vmov.i64 q12, #0 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vdup.i16 q13, r4 ++ ++ vzip.8 q0, q1 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 ++ ++ bx lr ++endfunc ++ ++ ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u8 q9, q14, q9 ++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u8 q9, q9, q0 ++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u8 q9, q9, q0 ++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u8 q0, q9, q0 ++ ++ vadd.s8 q3, q1, 
q15 @ Add -128 so we can use saturating signed add
++
++        vuzp.8          d0, d1
++
++        vtbl.8          d0, {d16}, d0
++        vtbl.8          d1, {d17}, d1
++
++        vzip.8          d0, d1
++        vqadd.s8        q0, q3
++        vsub.s8         q0, q15
++
++        bx              lr
++endfunc
++
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q9
++@
++@ q12, #0
++@ d16, d17 xlat U, V
++@ q14.u16 #2
++@ q15.u16 max
++function edge_16b_body_16
++        vcgt.u16        q9, q0, q1              @ a > c -> -1 , otherwise 0
++        vadd.u16        q9, q14, q9             @ q9 = 2 - (a > c)
++        vcgt.u16        q0, q1, q0              @ c > a -> -1 , otherwise 0
++        vsub.u16        q9, q9, q0              @ q9 = 2 + sign(c-a)
++        vcgt.u16        q0, q2, q1              @ c < b -> -1 , otherwise 0
++        vadd.u16        q9, q9, q0
++        vcgt.u16        q0, q1, q2              @ c > b -> -1 , otherwise 0
++        vsub.u16        q0, q9, q0              @ q0 = 2 + sign(c-a) + sign(c-b), i.e. 0-4
++
++        vmovn.s16       d0, q0                  @ Narrow the 0-4 indices to bytes for vtbl
++        @ d1 will have random contents that we transform but
++        @ that doesn't matter as we then discard them
++        vuzp.8          d0, d1                  @ Split interleaved U/V indices so each uses its own table
++
++        vtbl.8          d0, {d16}, d0
++        vtbl.8          d1, {d17}, d1
++
++        vzip.8          d0, d1                  @ Back to interleaved order
++
++        vaddw.s8        q0, q1, d0              @ q0 = c + sign-extended offset
++
++        @ now clip
++        vmax.s16        q0, q12
++        vmin.s16        q0, q15
++        bx              lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]   // Chroma only
++@   int eo,                           [sp, #sp_base + 0]
++@   int width,                        [sp, #sp_base + 4]
++@   int height)                       [sp, #sp_base + 8]
++
++@ Jumps via jump_tab with
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   EDGE_SRC_STRIDE                   [r3]
++@   (1 << \bit_depth) - 1             [r4]
++@   * xlat_table                      [r5]  // setup_64b only
++@   int height                        [r12]
++@
++@   0                                 [q12] // > 8 bit
++@   2                                 [q14]
++@   128                               [q15] // = 8 bit
++@   r4                                [q15] // > 8 bit
++
++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
++
++@ Build translate registers
++@ As translate values can only be 0-4 we don't care about junk in the rest
++@ of the register
++.if \is_chroma
++        ldr             ip, [sp, #0]
++        push            {r4-r6, lr}   @ 16 bytes
++        vld1.8          {d16[2]}, [r3]
++        add             r3, r3, #2
++ vld1.8 {d17[2]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[1]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[3]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[3]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[4]}, [r3] ++ vld1.8 {d17[4]}, [ip] ++ movw r3, EDGE_SRC_STRIDE ++.set sp_base, 20 ++.else ++ add ip, r3, #4 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #6 ++ vld1.8 {d17[1]}, [ip] ++ vld1.8 {d16[2]}, [r3] ++ movw r3, EDGE_SRC_STRIDE ++ push {r4-r6, lr} @ 16 bytes ++ vzip.8 d16, d17 ++ vmov d17, d16 ++.set sp_base, 16 ++.endif ++ ++@ If setup_64b we need the xlat table on the stack ++.if \setup_64b ++ sub r5, sp, #16 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ adr r6, \jump_tab ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ cmp r12, #8 ++ it lt ++ addlt r6, #16 ++.else ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.endif ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++ vmov.u16 q14, #2 ++.else ++ vmov.u8 q15, #128 ++ vmov.u8 q14, #2 ++.endif ++.endif ++ ++@ If setup_64b we need q4-q7 saved. 
++.if \setup_64b ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++ ldr r6, [r6, lr, lsl #2] ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++ ++.if jent_pic && !\xjump ++@ Magic label - used as 98b in jent macro ++98: ++ add pc, r6 ++.endif ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldm r1, {d7-d16} ++ // load a ++ vext.8 q0, q3, q4, #(16 - \pb) ++ add r1, r3 ++ vext.8 q1, q4, q5, #(16 - \pb) ++ subs r12, #1 ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) ++ pld [r1] ++ // load b ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ pld [r1, #64] ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d7-d12} ++ // load a ++ vext.8 q0, q3, q4, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q4, q5, #16 - \pb ++ subs r12, #2 ++ // load b ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vldr d25, [r6, #-8] ++ vldmia r6, {d12-d15} ++ vldr d26, [r6, #32] ++ // load a ++ vext.8 q2, q12, q6, #16 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q6, q7, #16 - \pb 
++ // load b ++ vext.8 q10, q6, q7, #\pb ++ vext.8 q11, q7, q13, #\pb ++ bl \body_fn ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldmia r1, {d1-d4} ++ add r1, r3 ++ subs r12, #1 ++ vext.8 q0, q0, q1, #16 - \pb ++ vext.8 q2, q1, q2, #\pb ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d1-d2} ++ vldmia r6, {d3-d4} ++ vldr d6, [r1, #16] ++ subs r12, #2 ++ vldr d7, [r6, #-8] ++ add r1, r1, r3, lsl #1 ++ vext.8 d0, d1, d2, #8 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 d5, d3, d4, #\pb ++ vext.8 d4, d2, d6, #\pb ++ vext.8 d1, d7, d3, #8 - \pb ++ ++ bl \body_fn ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++ tst r1, #4 ++ bne 2f ++1: // r1 (and assumed r6) are 64-bit aligned ++ vldr d2, [r1] ++ vldr d0, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d20, [r6] ++ subs r12, #4 ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d3, [r1] ++ vshr.u64 d4, d2, #\pb * 8 ++ vldr d1, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d21, [r6] ++ vext.8 d0, d0, d2, #8 - \pb ++ vldr d19, [r6,#-8] ++ add r6, r6, r3, lsl #1 ++ vshr.u64 d22, d20, #\pb * 8 ++ vext.8 d18, d18, d20, #8 - \pb ++ vshr.u64 d5, d3, #\pb * 8 ++ vext.8 d1, d1, d3, #8 - \pb ++ vshr.u64 d23, d21, #\pb * 8 ++ vext.8 d19, d19, d21, #8 - \pb ++ vsli.64 q1, q10, #32 ++ vsli.64 q2, q11, #32 ++ vsli.64 q0, q9, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ pop {r7,pc} ++ ++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned ++ vldr d20, [r1, #-4] ++ vldr 
d22, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d2, [r6, #-4] ++ subs r12, #4 ++ vldr d4, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vldr d21, [r1, #-4] ++ vshl.i64 d18, d20, #\pb * 8 ++ vldr d23, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d3, [r6, #-4] ++ vext.8 d22, d20, d22, #\pb ++ vldr d5, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vshl.i64 d0, d2, #\pb * 8 ++ vext.8 d4, d2, d4, #\pb ++ vshl.i64 d19, d21, #\pb * 8 ++ vext.8 d23, d21, d23, #\pb ++ vshl.i64 d1, d3, #\pb * 8 ++ vext.8 d5, d3, d5, #\pb ++ vsri.64 q1, q10, #32 ++ vsri.64 q0, q9, #32 ++ vsri.64 q2, q11, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 2b ++ pop {r7,pc} ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ sub r1, r3 ++ push {lr} ++ add r6, r1, #32 ++ // load a ++ vld1.8 {q0-q1}, [r1, :256], r3 ++ vld1.8 {q2-q3}, [r6, :256], r3 ++ // load c ++ vld1.8 {q4-q5}, [r1, :256], r3 ++ vld1.8 {q6-q7}, [r6, :256], r3 ++1: // load b ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #1 ++ vld1.8 {q10-q11}, [r6, :256], r3 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ // copy c to a ++ vmov.64 q0, q4 ++ pld [r1, r3] ++ vmov.64 q1, q5 ++ it le ++ pople {lr} ++ vmov.64 q2, q6 ++ it le ++ bxle lr ++ vmov.64 q3, q7 ++ add r0, r0, r2 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ b 1b ++.endm ++ ++.macro edge_32bx2_e1, body_fn ++ sub r6, r1, r3 ++ vld1.8 {q2-q3}, [r1, :256], r3 ++ vld1.8 {q0-q1}, [r6, :256] ++ mov r6, lr ++ ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #2 ++ vmov q4, q2 ++ vmov q5, q3 ++ vld1.8 {q10-q11}, [r1, :256], r3 ++ vmov q6, q8 ++ vmov q7, q9 ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ // copy b to a ++ vmov q0, q8 ++ vmov q1, q9 ++ vst1.8 {q2-q3}, [r0, :256], r2 ++ vmov q2, q10 ++ it le ++ 
bxle r6 ++ vmov q3, q11 ++ b 1b ++.endm ++ ++.macro edge_16b_e1, body_fn ++ sub r6, r1, r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load a ++ vld1.8 {q0}, [r6, :128] ++ mov r6, lr ++1: // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ // copy c to a ++ vmov.64 q0, q1 ++ it le ++ bxle r6 ++ // copy b to c ++ vmov.64 q1, q2 ++ b 1b ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.8 {d1}, [r1, :64], r3 ++ vld1.8 {d0}, [r6, :64], r3 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {d4}, [r6, :64], r3 ++ vmov d2, d1 ++ vld1.8 {d5}, [r1, :64], r3 ++ subs r12, #2 ++ vmov d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ++ // copy b to a ++ vmov q0, q2 ++ bgt 1b ++ pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.32 {d0[1]}, [r1, :32], r3 ++ add r7, r0, r2 ++ vld1.32 {d0[0]}, [r6, :32], r3 ++ lsl r2, #1 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vmov d1, d4 ++ vext.32 d2, d0, d4, #1 ++ subs r12, #4 ++ vmov d22, d5 ++ vext.32 d3, d4, d5, #1 ++ b 2f ++ ++1: vst1.32 {d0[0]}, [r0, :32], r2 ++ vext.32 d2, d22, d4, #1 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vmov d0, d22 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vext.32 d3, d4, d5, #1 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ vmov d1, d4 ++ vmov d22, d5 ++2: @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ bl \body_fn ++ ble 3f ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ subs r12, #4 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ b 1b ++ ++3: vst1.32 {d0[0]}, 
[r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32] ++ vst1.32 {d1[1]}, [r7, :32] ++ pop {r7, pc} ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldr d25, [r6, #-8] ++ vldmia r6, {d16-d23} ++ vext.8 q0, q12, q8, #16 - \pb ++ add r6, r1, #32 ++ vext.8 q1, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q2, q9, q10, #16 - \pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q10, q11, #16 - \pb ++ ++1: // load b ++ vldmia r1, {d16-d24} ++ vext.8 q8, q8, q9, #\pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #\pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d25, [r6, #-8] ++ vstmia r0, {q0-q3} ++ vext.8 q3, q6, q7, #16 - \pb ++ it le ++ pople {lr} ++ vext.8 q2, q5, q6, #16 - \pb ++ it le ++ bxle lr ++ vext.8 q1, q4, q5, #16 - \pb ++ add r6, r6, r3 ++ vext.8 q0, q12, q4, #16 - \pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vldr d8, [r1] ++ vext.8 d9, d16, d17, #8 - \pb ++ vext.8 q5, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q6, q9, q10, #16 - \pb ++ pld [r6, #-8] ++ vext.8 q7, q10, q11, #16 - \pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vld1.8 {q4-q5}, [r1, :256] ++ vldr d25, [r6, #-8] ++ vld1.8 {q13-q14}, [r6, :256] ++ vldr d31, [r1, #-8] ++ add r6, r6, r3, lsl #1 ++ vext.8 q0, q12, q13, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q13, q14, #16 - \pb ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++1: ++ // load second 32b of c and second 32b of b ++ vldmia r6, {d12-d16} ++ vldmia r1, {d20-d24} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q9, q7, q8, #\pb ++ subs r12, #2 ++ vext.8 q8, q6, q7, #\pb ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 
{q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d25, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d8, [r1] ++ vext.8 d9, d20, d21, #8 - \pb ++ vldr d31, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q1, q6, q7, #16 - \pb ++ vext.8 q0, q12, q6, #16 - \pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q5, q10, q11, #16 - \pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldr d19, [r6, #-8] ++ vld1.8 {q10}, [r6, :128], r3 ++ ++1: vldmia r1, {d4-d6} ++ vext.8 q0, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q2, q2, q3, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q10, q1 ++ vldr d2, [r1] ++ add r1, r1, r3 ++ vldr d19, [r6, #-8] ++ add r6, r6, r3 ++ vext.8 d3, d4, d5, #8 - \pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vldr d18, [r6, #-8] ++ vldr d19, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ ++1: vext.8 d0, d18, d19, #8 - \pb ++ vext.8 d4, d3, d4, #\pb ++ vext.8 d1, d20, d2, #8 - \pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vmov d19, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7-r9, lr} ++ add r8, r1, r3 ++ sub r6, r6, #\pb ++ add r8, r8, #\pb ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++1: 
vld1.32 {d0[0]}, [r6], r3 ++ subs r12, #4 ++ vld1.32 {d2[0]}, [r1], r3 ++ vld1.32 {d4[0]}, [r8], r3 ++ vld1.32 {d0[1]}, [r6], r3 ++ vld1.32 {d2[1]}, [r1], r3 ++ vld1.32 {d4[1]}, [r8], r3 ++ vld1.32 {d1[0]}, [r6], r3 ++ vld1.32 {d3[0]}, [r1], r3 ++ vld1.32 {d5[0]}, [r8], r3 ++ vld1.32 {d1[1]}, [r6], r3 ++ vld1.32 {d3[1]}, [r1], r3 ++ vld1.32 {d5[1]}, [r8], r3 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ ++ pop {r7-r9,pc} ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldmia r6, {d16-d24} ++ vext.8 q0, q8, q9, #\pb ++ add r6, r1, #32 ++ vext.8 q1, q9, q10, #\pb ++ add r1, r1, r3 ++ vext.8 q2, q10, q11, #\pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q11, q12, #\pb ++ ++1: // load b ++ vldr d17, [r1, #-8] ++ vldmia r1, {d18-d25} ++ vext.8 q8, q8, q9, #16 - \pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #16 - \pb ++ vext.8 q11, q11, q12, #16 - \pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d24, [r6, #64] ++ vstmia r0, {q0-q3} ++ vext.8 q0, q4, q5, #\pb ++ it le ++ pople {lr} ++ vext.8 q1, q5, q6, #\pb ++ it le ++ bxle lr ++ vext.8 q2, q6, q7, #\pb ++ add r6, r6, r3 ++ vext.8 q3, q7, q12, #\pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vext.8 d14, d22, d23, #\pb ++ vldr d15, [r1, #56] ++ vext.8 q4, q8, q9, #\pb ++ add r1, r1, r3 ++ vext.8 q5, q9, q10, #\pb ++ vext.8 q6, q10, q11, #\pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vldmia r1, {d8-d12} ++ vldmia r6, {d24-d28} ++ vext.8 q2, q4, q5, #\pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q5, q6, #\pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q0, q12, q13, #\pb ++ vext.8 q1, q13, q14, #\pb ++1: ++ // load second 32b 
of c and second 32b of b ++ vldr d25, [r6, #-8] ++ subs r12, #2 ++ vldmia r6, {d12-d15} ++ vldr d27, [r1, #-8] ++ vldmia r1, {d20-d23} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q8, q12, q6, #16 - \pb ++ vext.8 q9, q6, q7, #16 - \pb ++ vext.8 q11, q10, q11, #16 - \pb ++ vext.8 q10, q13, q10, #16 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d24, [r6, #32] ++ add r6, r6, r3, lsl #1 ++ vldr d11, [r1, #24] ++ vext.8 d10, d22, d23, #\pb ++ vldr d30, [r1, #32] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q0, q6, q7, #\pb ++ vext.8 q1, q7, q12, #\pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q4, q10, q11, #\pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q3, q5, q15, #\pb ++ vext.8 q2, q4, q5, #\pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldmia r6, {d18-d20} ++ add r6, r6, r3 ++ ++1: vldr d5, [r1, #-8] ++ vld1.8 {q3}, [r1, :128] ++ subs r12, #1 ++ vext.8 q0, q9, q10, #\pb ++ vext.8 q2, q2, q3, #16 - \pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q9, q1 ++ vldr d3, [r1, #8] ++ add r1, r1, r3 ++ vldr d20, [r6, #16] ++ add r6, r6, r3 ++ vext.8 d2, d4, d5, #\pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vld1.8 {d18-d19}, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ ++1: vext.8 d0, d18, d19, #\pb ++ vext.8 d4, d4, d3, #8 - \pb ++ vext.8 d1, d2, d20, #\pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #8 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d19, [r6, #8] ++ add r6, r6, r3, 
lsl #1 ++ vldr d20, [r1, #8] ++ vmov d18, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++@ Jump table entry - if in thumb mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is ++@ simpler and clearer in the code to stick with .word ++T .word (0 + \lab) - (4 + 98b) ++A .word (0 + \lab) - (8 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb ++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f
++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ 
int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int 
width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 64 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_rpi_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f, xjump=1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f, xjump=1 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_10( ++@
uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h +new file mode 100644 +index 0000000000..36a23a5bf9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_arm.h +@@ -0,0 +1,28 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H ++#define AVCODEC_ARM_HEVCPRED_ARM_H ++ ++#include "libavcodec/rpi_hevcpred.h" ++ ++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c +new file mode 100644 +index 0000000000..80724d4cf3 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++ ++#include "libavcodec/rpi_hevcpred.h" ++#include "rpi_hevcpred_arm.h" ++ ++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) ++{ /* Forwards c and bit_depth to ff_hevc_rpi_pred_init_neon() when have_neon() accepts the CPU flags; otherwise c is not modified here. */ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevc_rpi_pred_init_neon(c, bit_depth); ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c +new file mode 100644 +index 0000000000..21e7700174 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c +@@ -0,0 +1,210 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcpred_arm.h" ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; ++ ++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void 
ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const 
uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void 
ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void 
ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) /* Install the NEON intra-prediction routines for bit_depth (8 or 10) into c's function tables; any other depth leaves c untouched */ ++{ ++ switch (bit_depth) ++ { ++ case 8: /* 8-bit samples: luma tables take the *_neon_8 routines */ ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; /* NOTE(review): intra_filter[2] is not assigned for 8-bit although the 10-bit case assigns it - confirm this is intentional */ ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; /* in every pred_* table index n receives the routine for block size 4 << n; the *_c tables receive the _c_ routines */ ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; ++
c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; ++ break; ++ case 10: /* 10-bit samples: luma filters are the *_neon_16 routines, chroma filters the *_neon_32 routines, predictors the *_neon_10 routines */ ++ c->intra_filter[0] =
ff_hevc_rpi_intra_filter_4_neon_16; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; ++ c->pred_planar_c[0] =
ff_hevc_rpi_pred_planar_c_4_neon_10; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; ++ break; ++ default: /* any other bit depth: no NEON routine is installed */ ++ break; ++ } ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +new file mode 100644 +index 0000000000..fa8f67cf03 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +@@ -0,0 +1,2984 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * General angular pred ++ * ++ * Horizontal (10) & Vertical (26) cases have their own file ++ * and are not dealt with properly here (luma filtering is missing) ++ * ++ * The inv_angle calculations are annoying - if it weren't for the +128 ++ * rounding step then the result would simply be the loop counter :-( ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.text ++ ++@ Horizontal Patch functions ++@ These need a transpose before store so exist as smaller patches ++@ Patches can be called repeatedly without any intermediate setup ++@ to generate a horizontal block ++@ ++@ It is almost certainly the case that larger patch fns can be built ++@ and they would be a little faster, but we would still need the small ++@ fns and code size (or at least instruction cache size) is an issue ++@ given how much code we already have here ++ ++@ Generate 8x8 luma 8-bit patch ++@ ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit points to start of next patch horizontally (i.e.
r0 + patch width) ++@ r2 Left ptr - updated ++@ r10 Inv angle accumulator (_up only) ++@ r12 32 - angle frac (_down) or angle frac (_up) ++@ d0 Older reference samples ++@ d1=r8+r9 Newer reference samples ++@ d2 32 - angle frac ++@ d3 Angle frac ++@ q2 Partially computed next result (_up only) ++@ ++@ Temps ++@ r5 Loop counter ++@ r6 ++@ r7 (_down only) ++@ r11 (_up only) ++@ q2, q8-q11 ++ ++patch_h_down_8x8_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r2, #5]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_8x8_8_continue: ++ mov r5, #8 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ itt mi ++ lsrmi r7, r8, #8 ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ vext.8 q9, q9, q10, #8 ++ it mi ++ orrmi r8, r7, r9, lsl #24 ++ vext.8 q10, q10, q11, #8 ++ it mi ++ ldrmi r9, [r2, #1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_8x8_8: ++ vzip.8 d16, d17 ++ add r6, r0, r3 /* r6 -> output row 1 */ ++ vzip.8 d18, d19 ++ lsl r3, #1 /* double the stride: r5 and r6 each advance two rows per store */ ++ vzip.8 d20, d21 ++ add r5, r0, r3 /* r5 -> output row 2 */ ++ vzip.8 d22, d23 ++ vzip.16 q8, q9 ++ vzip.16 q10, q11 ++ vzip.32 q8, q10 ++ vzip.32 q9, q11 ++ vst1.8 {d16}, [r0]! /* row 0; writeback advances r0 by the 8 bytes stored */ ++ vst1.8 {d17}, [r6], r3 ++ vst1.8 {d20}, [r5], r3 ++ vst1.8 {d21}, [r6], r3 ++ vst1.8 {d18}, [r5], r3 ++ vst1.8 {d19}, [r6], r3 ++ vst1.8 {d22}, [r5] ++ asr r3, #1 /* undo the doubling so r3 is the caller's stride again */ ++ vst1.8 {d23}, [r6] ++ ++ bx lr ++ ++patch_h_up_8x8_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #24 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-1]!
++ orr r9, r11, r9, lsl #8 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_8x8_8_continue: ++ mov r5, #8 ++1: ++ add r12, r4 ++ mov r11, #0 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #8 ++ it cs ++ vmovcs d0, r8, r9 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #24 ++ vext.8 q9, q9, q10, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #8 ++ ldrbcs r11, [r1, r11] ++ vdup.8 d3, r12 ++ vext.8 q10, q10, q11, #8 ++ it hi ++ ldrbhi r11, [r2, #-1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #8 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_8x8_8 ++ ++ ++.macro ADRT reg, val ++@ adr in T32 has enough range but not in A32 ++A adrl \reg, \val ++T adr \reg, \val ++.endm ++ ++@ ff_hevc_rpi_pred_angular_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_8, export=1 ++ ldr r12, [sp] /* r12 = mode, read before the push below moves sp */ ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 /* base biased by 2 so that mode (2..34) indexes the byte table directly */ ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 /* r7 = address of halfword entry (mode - 11) of inv_angle */ ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] /* r4 and r6 both start as the angle byte for this mode */ ++ bge 26f /* mode >= 26 */ ++ cmp r12, #18 ++ bge 18f /* 18 <= mode <= 25 */ ++ cmp r12, #10 ++ bge 10f /* 10 <= mode <= 17; modes 2..9 fall through */ ++ ++@ Down of Horizontal - works down left ++ ldr lr, [r2], #1 @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r2], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d20, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.64 q8, q8, q9, #1 ++ it mi ++
vmovmi s0, lr ++ vext.64 q9, q9, q10, #1 ++ it mi ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d20, q2, #5 ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ vrshrn.u16 d20, q2, #5 ++ ++98: ++ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 ++ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 ++ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] ++ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r2] @ Left ++ ldrb r2, [r2, #-1] @ Top-left ++ vmov s0, lr ++ vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r2, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r2, [r1, r6] ++A ldrbmi r2, [r1, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r2, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d20, q2, #5 ++ vdup.8 d3, r6 ++ it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vrshrn.u16 d20, q2, #5 ++ b 98b ++ ++@ Left of vertical - works down left ++18: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r1] @ Top ++ ldrb r1, [r2, #-1] @ Top-left ++ vmov s0, lr ++ vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r1, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r1, [r2, r6] ++A ldrbmi r1, [r2, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r1, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d4, q2, #5 ++ vdup.8 d3, r6 ++ 
it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vst1.32 {d4[0]}, [r0], r3 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.32 {d4[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldr lr, [r1], #1 @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r1], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d6, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vst1.32 {d6[0]}, [r0], r3 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d6, q2, #5 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vst1.32 {d6[0]}, [r0], r3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.32 {d6[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrb lr, [r2, #-1] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl 
r9, r9, #8 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #24 ++ orr r8, lr, r8, lsl #8 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #7 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ ittt mi ++ addmi lr, r2, r1, asr #8 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #8 ++ ldrbmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #24 ++ orrmi r8, lr, r8, lsl #8 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.8 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.8 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #7 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r1, #5]! ++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #8 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #24 ++ ldrmi r9, [r1, #1]! 
++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.8 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ restore r2, but 8 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #3 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #15 ++ sub r8, r7, #128 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #15 ++1: ++ vmull.u8 q0, d18, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #15 ++ sub r5, #1 ++ vld1.8 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ 
vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d22, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #15 ++ sub r5, #1 ++ vld1.8 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #1 ++ vld1.8 {d17[7]}, [r1]! 
++ mov r5, #15 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #1 ++ teq r5, #0 ++ vld1.8 {d21[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #1 ++ teq r5, #0 ++ vld1.8 {d17[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ 
restore r2, but 8 rows further down left ++ add r1, r1, #8 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #3 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2,r10} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ pop {r2,r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #32 ++1: ++ vld1.8 {d17[7]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ add r9, r2, r8, asr #8 ++ vext.8 q1, q0, q1, #15 ++ vext.8 q0, q8, q0, #15 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.8 {d16[0]}, [r5] ++ mov r5, #32 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #1 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #1 ++ vext.8 q1, q1, q8, #1 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ Chroma 8 bit 4x4 patch fns ++ .text ++ ++patch_h_down_c_4x4_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshrn.u16 d19, q2, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_c_4x4_8: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_c_4x4_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #16 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-2]! 
++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.8 d3, r12 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! ++ vrshrn.u16 d19, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_c_4x4_8 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, 
r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #3 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! ++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.16 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.16 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl 
patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #14 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #7 ++1: ++ subs r12, r4 ++ vmull.u8 q0, d18, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #14 ++ sub r5, #1 ++ vld1.16 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ subs r12, r4 ++ vmull.u8 q0, d22, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #14 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ 
vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #2 ++ vld1.16 {d17[3]}, [r1]! ++ mov r5, #7 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #2 ++ teq r5, #0 ++ vld1.16 {d21[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #2 ++ teq r5, #0 ++ vld1.16 {d17[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp 
r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2, r10} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.8 q1, q0, q1, #14 ++ add r9, r2, r9, lsl #1 ++ vext.8 q0, q8, q0, #14 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #2 ++ vext.8 q1, q1, q8, #2 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ Data ++ ++ .text ++ .balign 64 ++angle_2: ++ .byte 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Sign inverted from standards table ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Standard sign ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ ++ .balign 2 ++ ++ @ Sign inverted from standards table ++inv_angle: ++ .short 4096, 1638, 910, 630, 482, 390, 315 ++ .short 256 ++ .short 315, 390, 482, 630, 910, 1638, 4096 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bit fns ++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code ++@ but runs out of register width for 12+ bit ++ ++ .text ++ .balign 64 ++ ++patch_h_down_4x4_10: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... 
++patch_h_down_4x4_10_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmul.u16 d4, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmla.u16 d4, d1, d3 ++ rsb r6, r12, #32 ++ vext.16 q8, q8, q9, #4 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.16 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshr.u16 d19, d4, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.16 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_4x4_10: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_4x4_10: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r4 ++ lsr r11, r8, #16 ++ vdup.16 d2, r6 ++ ldr r8, [r2, #-2]! ++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++patch_h_up_4x4_10_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.16 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.16 d3, r12 ++ vext.16 q8, q8, q9, #4 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! 
++ vrshr.u16 d19, d4, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmul.u16 d4, d0, d2 ++ subs r5, #1 ++ vmla.u16 d4, d1, d3 ++ bne 1b ++ ++ b store_tran_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.16 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 ++ vdup.16 d3, r6 ++ vmul.u16 d4, d0, d2 ++ subs r12, r12, r4 ++ vmla.u16 d4, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.16 d2, r12 ++ vrshr.u16 d4, d4, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.16 d3, r6 ++ nop @ force next insn into pipeline 0 to enable ++ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - 
works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++ mov r5, #3 ++1: ++ vmul.u16 d4, d0, d2 ++ subs r12, r4 ++ vmla.u16 d4, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.16 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! ++ vrshr.u16 d4, d4, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.16 d3, r6 ++ subs r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down 
left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #7 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #7 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q10, q8, q8, #7 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q8, q10, q10, #7 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #1 ++ vld1.16 {d17[3]}, [r1]! 
++ mov r5, #7 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d21[3]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d17[3]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 
++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.16 q1, q0, q1, #7 ++ add r9, r2, r9, lsl #1 ++ vext.16 q0, q8, q0, #7 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #1 ++ vext.16 q1, q1, q8, #1 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #8 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<6 ++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl 
patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #64 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #2 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov d0, d9 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d1[3]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #1 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #7 ++ vext.16 q3, q2, q3, #7 ++ vext.16 q2, q1, q2, #7 ++ vext.16 q1, q0, q1, #7 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov d1, d9 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d0[0]}, [r1]! 
++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #1 ++ vext.16 q2, q2, q3, #1 ++ vext.16 q3, q3, q4, #1 ++ vext.16 q4, q4, q0, #1 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++ ++@ Generate 4x4 chroma patch ++@ ++@ In (const) ++@ r1 Up ptr (_up only) ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ q2 Cur Line - load before 1st call for down - set by _up ++@ q8 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ d0, q1, q12-q15 ++ ++patch_h_down_c_4x4_10: ++ vld1.16 {q12}, [r2]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! ++patch_h_down_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q13, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q12, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs 3f ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! 
++3: ++ ++store_tran_c_4x4_10: ++T add r6, r0, r3 ++ vzip.32 q8, q10 ++A add r6, r0, r3 ++T lsl r3, #1 ++ vzip.32 q9, q11 ++A add r5, r0, r3, lsl #1 ++T add r5, r0, r3 ++ vst2.32 {d16,d18}, [r0]! ++A lsl r3, #1 ++ vst2.32 {d17,d19}, [r6], r3 ++ asr r3, #1 ++ vst2.32 {d20,d22}, [r5] ++ mov r5, #4 ++ vst2.32 {d21,d23}, [r6] ++ bx lr ++ ++patch_h_up_c_4x4_10: ++ vld1.16 {q1}, [r2] ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! ++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++patch_h_up_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q12, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q1, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs store_tran_c_4x4_10 ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! 
++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++ b store_tran_c_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #4 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #6 ++ sub r8, r7, #128 ++ vld1.32 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #3 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q10, q8, q8, #6 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q8, q10, q10, #6 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ 
vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #2 ++ vld1.32 {d17[1]}, [r1]! ++ mov r5, #3 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d21[1]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d17[1]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r9, lr} @ r9 is callee-saved and is written by the 18: path, so it must be saved here ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_10 ++ bl 
patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ sub r0, #32 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ ++ pop {r4-r9, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #32 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ ++ pop {r4-r9, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r9, r2, #4 @ r9 = r2 - 4; r9 is the reason the prologue saves r4-r9 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #8 @ r5 counts output rows still to be stored ++1: ++ vld1.32 {d17[1]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.16 q1, q0, q1, #6 ++ add r9, r2, r9, lsl #2 ++ vext.16 q0, q8, q0, #6 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r9, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.32 {d16[0]}, [r5] ++ mov r5, #8 @ r5 counts output rows still to be stored ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #4 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #2 ++ vext.16 q1, q1, q8, #2 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r9, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r10, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*4 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ mov r10, #4 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #64 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ subs r10, #1 ++ bne 2b ++ ++ pop {r4-r10, pc} ++ ++@ Left 
of vertical - works down left ++18: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #4 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov d0, d9 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d1[1]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #2 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #6 ++ vext.16 q3, q2, q3, #6 ++ vext.16 q2, q1, q2, #6 ++ vext.16 q1, q0, q1, #6 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov d1, d9 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d0[0]}, [r1]! 
++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #2 ++ vext.16 q2, q2, q3, #2 ++ vext.16 q3, q3, q4, #2 ++ vext.16 q4, q4, q0, #2 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r10, pc} ++ ++endfunc +diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +new file mode 100644 +index 0000000000..df8c1c25b9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +@@ -0,0 +1,705 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ ldr r2, [r2] ++ vld1.32 {d0[0]}, [r1] ++ mov r1, #2 ++ vmov s1, r2 ++ vmov s2, r2 ++ vmov.i16 q2, #3 ++ add r2, r0, r3 ++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] ++ lsl r3, #1 ++ vmovl.u8 q0, d0 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.32 {d0[0]}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d1, d0, #5*8 ++ vshr.u64 d2, d0, #6*8 ++ vshr.u64 d3, d0, #7*8 ++ vbif d1, d6, d7 ++ vbif d2, d6, d7 ++ vst1.32 {d1[0]}, [r2], r3 ++ vbif d3, d6, d7 ++ vst1.32 {d2[0]}, [r0] ++ vst1.32 {d3[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ vld1.8 {d1}, [r2] ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T lsl r3, #1 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshrn.u16 d0, q1, #3 ++ ++ @ Store ++ vst1.8 {d0}, [r0], r3 ++ vst1.8 {d0}, [r2], r3 ++ vst1.8 {d0}, [r0] ++ vst1.8 {d0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ mov r1, #2 ++ vld1.8 {d16}, [r2] ++ vmov.i16 q2, #3 ++ vmov.i64 d7, #0xffff ++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] ++ vmovl.u8 q0, d0 ++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vmovl.u8 q1, d16 ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q1, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d2, q1, #2 ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.8 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d2, #8 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ mov r1, #6 ++1: ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ subs r1, #2 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ mov r1, #8 ++ vld1.8 {q1}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vadd.i16 q1, q0 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshrn.u16 d0, q1, #4 ++ vrshrn.u16 d1, q1, #4 ++ ++ @ Store ++1: ++ vst1.8 {q0}, [r0], r3 ++ subs r1, #4 ++ vst1.8 {q0}, [r2], r3 ++ vst1.8 {q0}, [r0], r3 ++ vst1.8 {q0}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8}, [r1] ++ mov r1, #2 ++ vld1.8 {q9}, [r2] ++ vaddl.u8 q10, d16, d17 ++ vaddl.u8 q11, d16, d18 ++ vaddl.u8 q0, 
d18, d19 ++ vmov.i16 q1, #3 ++ vadd.i16 q10, q0 ++ vmovl.u8 q0, d18 ++ vadd.i16 d20, d21 ++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmovl.u8 q2, d16 ++ vmovl.u8 q9, d19 ++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) ++ vmov.i64 d7, #0xffff ++ vmovl.u8 q8, d17 ++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] ++ vmov.i64 d7, #0xff ++ vpadd.i16 d20, d20 @ 1 (all the same) ++ vrshr.u16 d21, d20, #5 ++ vrshr.u16 d20, d20, #5 ++ vmla.i16 q0, q10, d2[1] ++ vmla.i16 q9, q10, d2[1] ++ vmla.i16 q2, q10, q1 ++ vmla.i16 q8, q10, d2[1] ++ vdup.8 q1, d20[0] ++ vrshrn.i16 d0, q0, #2 ++ vrshrn.i16 d1, q9, #2 ++ vrshrn.i16 d4, q2, #2 ++ vrshrn.i16 d5, q8, #2 ++ vext.8 q0, q0, q0, #1 ++ ++ @ Store top line ++ vst1.8 {q2}, [r0], r3 ++ ++ @ Store the rest ++ mov r1, #15 ++1: ++ vbit d2, d0, d7 ++ vext.8 q0, q0, q0, #1 ++ subs r1, #1 ++ vst1.8 {q1}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #16 ++ vld1.8 {q2-q3}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vaddl.u8 q1, d2, d3 ++A lsl r3, #2 ++T lsl r3, #1 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshrn.u16 d0, q2, #5 ++ vrshrn.u16 d1, q2, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q2, #5 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_8 ++@ 
uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_32_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #32 ++ vld1.8 {q2-q3}, [r2] ++ add r2, r0, r3 ++ vaddl.u8 q0, d0, d1 ++ lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshrn.u16 d0, q2, #6 ++ vrshrn.u16 d1, q2, #6 ++ vrshrn.u16 d2, q2, #6 ++ vrshrn.u16 d3, q2, #6 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ----------------------------------------------------------------------------- ++@ ++@ 10 Bit versions ++@ ++@ There is no actual bit depth dependency in this code except that our ++@ intermediate results will overflow the 16 bits they are stored in ++@ All these functions are good to 10 bits - with the worst case being ++@ in dc_32 where we use all 16 bits. ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {d0}, [r1] ++ mov r1, #2 ++ vld1.16 {d1}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vmov.i64 d7, #0xffff ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vrshr.u16 q0, #2 ++ ++ @ Store top line ++ vst1.16 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d3, d1, #1*16 ++ vshr.u64 d4, d1, #2*16 ++ vshr.u64 d5, d1, #3*16 ++ vbif d3, d6, d7 ++ vbif d4, d6, d7 ++ vst1.16 {d3}, [r2], r3 ++ vbif d5, d6, d7 ++ vst1.16 {d4}, [r0] ++ vst1.16 {d5}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ vld1.8 {q1}, [r2] ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q0, q1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshr.u16 q0, q1, #3 ++ ++ vst1.16 {q0}, [r0], r3 ++ vst1.16 {q0}, [r2], r3 ++ vst1.16 {q0}, [r0] ++ vst1.16 {q0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0}, [r1] ++ mov r1, #2 ++ vld1.16 {q8}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q8, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.16 q2, d6[0] ++ vdup.16 q9, d6[0] ++ vrshr.u16 q8, q8, #2 ++ vrshr.u16 q0, q0, #2 ++ vext.16 q1, q8, q8, #1 ++ ++ @ Store top line ++ vst1.16 {q0}, [r0], r3 ++ ++ @ Store the rest ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ mov r1, #6 ++1: ++ vext.16 q8, q8, q8, #2 ++ subs r1, #2 ++ vext.16 q1, q1, q1, #2 ++ vbit d4, d16, d7 ++ vst1.16 {q2}, [r0], r3 ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0-q1}, [r1] ++ mov r1, #8 ++ vld1.16 {q2-q3}, [r2] ++T lsl r3, #2 ++ vadd.i16 q1, q0 ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q2, q3 ++ vadd.i16 q1, q2 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshr.u16 q0, q1, #4 ++ vrshr.u16 q1, q1, #4 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q8-q9}, [r1] ++ mov r1, #2 ++ vld1.16 {q10-q11}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.i16 q0, 
q8, q9 ++ vadd.i16 q1, q10, q11 ++ vmov.i16 q3, #3 ++ vadd.i16 q1, q0 ++ vadd.i16 d0, d16, d20 ++ vmov.i64 d31, #0xffff ++ vadd.i16 d3, d2 ++ vmov.16 d6[0], r1 @ 2, 3, 3, 3... ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ topline[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] ++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d3, d3 @ 1 (all the same) ++ vrshr.u16 d2, d3, #5 ++ vrshr.u16 d3, d3, #5 ++ vmov q0, q1 ++ vmla.i16 q10, q1, d6[1] ++ vmla.i16 q11, q1, d6[1] ++ vmla.i16 q8, q1, q3 ++ vmla.i16 q9, q1, d6[1] ++ vrshr.u16 q2, q10, #2 ++ vrshr.u16 q3, q11, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vext.16 q2, q2, q2, #1 ++ mov r1, #7<<29 ++ ++ @ Store top line ++ vst1.16 {q8-q9}, [r0], r3 ++ ++ @ Store the rest ++1: ++ vbit d0, d4, d31 ++ vext.16 q2, q2, q2, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++1: ++ vbit d0, d6, d31 ++ vext.16 q3, q3, q3, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #16 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #2 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshr.u16 q0, q2, #5 ++ vrshr.u16 q1, q2, #5 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const 
uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels) ++ ++function ff_hevc_rpi_pred_dc_32_neon_10, export=1 ++ ++ @ Average the els of top & left ++ @ With 10 bits we are (just) safe from overflow in i16 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #32 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #1 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshr.u16 q0, q2, #6 ++ vrshr.u16 q1, q2, #6 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +new file mode 100644 +index 0000000000..f6969d3591 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +@@ -0,0 +1,881 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ All functions have the call ++@ ++@ int ff_hevc_rpi_intra_filter_N_neon_PW( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++@ ++@ Assumptions: ++@ (that wouldn't apply to all frame layouts but do apply to sand, so beware ++@ if reusing this code) ++@ ++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for ++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore ++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger ++@ ++@ We always have at least 64 pixel H frame width rounding - this lets us ++@ load UR without having to worry about exactly how many pixels are actually ++@ within the frame. As partial loads will only occur very occasionally this ++@ should be a win in nearly all cases. 
++@ ++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters ++@ so we do no maths on the contents ++@ ++@ No filtering in 32bit fns as they are chroma only ++ ++ ++.equ AVAIL_UR, 1 ++.equ AVAIL_U, 2 ++.equ AVAIL_UL, 4 ++.equ AVAIL_L, 8 ++.equ AVAIL_DL, 16 ++ ++.equ FILTER_LIGHT, 0x40 ++.equ FILTER_STRONG, 0x80 ++ ++.equ AVAIL_S_UR_N_U_C, 32 - 1 ++.equ AVAIL_S_U_N_UL_C, 32 - 2 ++.equ AVAIL_S_UL_N_L_C, 32 - 3 ++.equ AVAIL_S_L_N_DL_C, 32 - 4 ++ ++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr ++ ++@ On entry ++@ r2 req ++@ r3 avail ++@ [sp, #sp_offset...] args ++@ ++@ On Exit: ++@ ++@ Extend values: ++@ d_l scalar contains value for L & DL ++@ if DL avail then this is DL[0] so we don't need to load that ++@ d_ul scalar containing value for UL ++@ d_u scalar containing value for U ++@ d_ur scalar containing value for UR ++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... ++@ This means that L-light-filter works even if nreq DL (we never filter ++@ req-DL without req-L, but we do filter req-L without req-DL) ++@ If UR avail then d_ur == a_ur so U-filter good too ++@ ++@ Data load pointers (only load if req & avail): ++@ r4 DL + stride ++@ r10 L ++@ r6 U ++@ r5 UR ++@ ++@ Others: ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride * 2 ++@ r9 stride * 2 ++@ cs Load U ++@ mi Load UR ++@ ++@ Clobbered: ++@ r12 ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur ++ ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it 
replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! ++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4], r9 ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 ++.endm ++ ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] ++ ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d0[5]}, [r4], r9 ++ vld1.8 {d0[6]}, [r8] ++ vld1.8 {d0[7]}, [r4] ++1: ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_16( ++@ pixel * const left, [r0] ++@ pixel * 
const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] ++ ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d1[1]}, [r4], r9 ++ vld1.16 {d1[2]}, [r8] ++ vld1.16 {d1[3]}, [r4] ++1: ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++ ++function ff_hevc_rpi_intra_filter_8_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] ++ ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] 
++ vld1.8 {d0[7]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d1[1]}, [r4], r9 ++ vld1.8 {d1[2]}, [r8], r9 ++ vld1.8 {d1[3]}, [r4], r9 ++ vld1.8 {d1[4]}, [r8], r9 ++ vld1.8 {d1[5]}, [r4], r9 ++ vld1.8 {d1[6]}, [r8] ++ vld1.8 {d1[7]}, [r4] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[7] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ ldr 
r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, d7} ++ bgt 1f ++ vdup.16 d7, d6[3] ++1: ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r8], r9 ++ vld1.16 {d2[3]}, [r4], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r8], r9 ++ vld1.16 {d3[1]}, [r4], r9 ++ vld1.16 {d3[2]}, [r8] ++ vld1.16 {d3[3]}, [r4] ++ b 1f ++2: ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] 
(pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 ++ @ Given chroma frame layout, if UR exists then it is always legit to ++ @ load all of it even if most of it is outside the frame. ++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] ++ ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r8], r9 ++ vld1.16 {d4[3]}, [r4], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r8], r9 ++ vld1.16 {d5[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r8], r9 ++ vld1.16 {d5[3]}, [r4], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r8], r9 ++ vld1.16 {d6[1]}, [r4], r9 ++ vld1.16 {d6[2]}, [r8], r9 ++ vld1.16 {d6[3]}, [r4], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r8], r9 ++ vld1.16 {d7[1]}, [r4], r9 ++ vld1.16 {d7[2]}, [r8] ++ vld1.16 {d7[3]}, [r4] ++ b 1f ++2: vdup.16 d5, 
d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vrshr.u16 q10, #2 ++ vrshr.u16 q11, #2 ++ ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} ++ ++10: ++ vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const 
unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] ++1: ++ bcc 1f ++ vld1.32 {d2[1]}, [r4], r9 ++ vld1.32 {d3[0]}, [r8] ++ vld1.32 {d3[1]}, [r4] ++1: ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 
{d2[0]}, [r10], r9
++        vld1.32     {d2[1]}, [r3], r9
++        vld1.32     {d3[0]}, [r10]
++        vld1.32     {d3[1]}, [r3]
++1:
++        bcc         1f
++        ldr         r12, [sp, #dl_size]
++        vld1.32     {d4[1]}, [r4], r9
++        cmp         r12, #p_size
++        vld1.32     {d5[0]}, [r8], r9
++        vld1.32     {d5[1]}, [r4], r9
++        blt         2f
++        vld1.32     {d6[0]}, [r8], r9
++        vld1.32     {d6[1]}, [r4], r9
++        vld1.32     {d7[0]}, [r8]
++        vld1.32     {d7[1]}, [r4]
++        b           1f
++2:
++        vdup.32     q3,  d5[1]
++1:
++        add         r12, r0, #-pw
++        vstm        r1, { q8-q11}                   @ Up
++        vst1.32     {d31[1]}, [r12]
++        vstm        r0, { q0-q3 }                   @ Left
++        pop         {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_16_neon_32(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++
++.set    sp_base, 8*4
++.set    ur_size, sp_base + 16
++.set    dl_size, sp_base + 20
++.set    pw_s,    2
++.set    pw,      (1 << pw_s)
++.set    log2_s,  4
++.set    p_size,  (1 << log2_s)          @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_32, export=1
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
++
++        @ Once we get this big we have run out of neon regs to store everything at once so do in pieces
++
++        @ Up (have)
++        it cs
++        vldmcs      r6,  { q0-q3 }                  @ C set: fetch the 16 up pels
++        ldr         r12, [sp, #ur_size]             @ r12 = top_right_size
++        it mi
++        vldmmi      r5,  { q8-q11}                  @ N set: fetch 16 up-right pels (4 per quad)
++        it cs
++        vstmcs      r1,  { q0-q3 }
++        bpl         1f                              @ N clear: no up-right to pad or store here
++        cmp         r12, #12
++        add         lr,  r1,  #(pw << log2_s)       @ lr = top + 16 pels; ADD leaves the flags alone
++        bgt         2f                              @ size > 12: q8-q11 are all valid
++        bge         3f                              @ size == 12: only q11 needs padding
++        cmp         r12, #8
++        bge         4f                              @ size 8..11: pad q10 and q11
++        vdup.32     q9,  d17[1]                     @ size < 8: pad q9-q11
++4:      vdup.32     q10, d19[1]                     @ each pad repeats the last 32-bit pel of the quad before
++3:      vdup.32     q11, d21[1]
++2:      vstm        lr, { q8-q11}
++1:
++
++        @ Left (have)
++        add         lr,  r0,  #-pw
++        lsls        r12, r7,  #AVAIL_S_L_N_DL_C     @ N/C now select the left / down-left paths below
++        vst1.32     {d30[1]}, [lr]                  @ UL
++        bpl         1f
++        vld1.32     { d0[0]}, [r10], r9
++        vld1.32     { d0[1]}, [r3],  r9
++        vld1.32     { d1[0]}, [r10], r9
++        vld1.32     { d1[1]}, [r3],  r9
++        vld1.32     { d2[0]}, [r10], r9
++        vld1.32     { d2[1]}, [r3],  r9
++        vld1.32     { d3[0]}, [r10], r9
++        vld1.32     { d3[1]}, [r3],  r9
++        vld1.32     { d4[0]}, [r10], r9
++        vld1.32     { d4[1]}, [r3],  r9
++        vld1.32     { d5[0]}, [r10], r9
++        vld1.32     { d5[1]}, [r3],  r9
++        vld1.32     { d6[0]}, [r10], r9
++        vld1.32     { d6[1]}, [r3],  r9
++        vld1.32     { d7[0]}, [r10]
++        vld1.32     { d7[1]}, [r3]
++        vstm        r0,  { q0-q3 }
++1:
++        bcc         1f
++        ldr         r12, [sp, #dl_size]             @ r12 = down_left_size
++        vdup.32     d16, d30[0]                     @ d16[0] = d30[0]
++        add         lr,  r0,  #(pw << log2_s)       @ lr = left + 16 pels
++        vld1.32     {d16[1]}, [r4],  r9
++        cmp         r12, #4
++        vld1.32     {d17[0]}, [r8],  r9
++        vld1.32     {d17[1]}, [r4],  r9
++        ble         2f                              @ size <= 4: pad q9-q11
++        vld1.32     {d18[0]}, [r8],  r9
++        vld1.32     {d18[1]}, [r4],  r9
++        cmp         r12, #12
++        vld1.32     {d19[0]}, [r8],  r9
++        vld1.32     {d19[1]}, [r4],  r9
++        blt         3f                              @ size < 12: pad q10-q11
++        vld1.32     {d20[0]}, [r8],  r9
++        vld1.32     {d20[1]}, [r4],  r9
++        vld1.32     {d21[0]}, [r8],  r9
++        vld1.32     {d21[1]}, [r4],  r9
++        ble         4f                              @ size == 12: pad q11 only
++        vld1.32     {d22[0]}, [r8],  r9
++        vld1.32     {d22[1]}, [r4],  r9
++        vld1.32     {d23[0]}, [r8]
++        vld1.32     {d23[1]}, [r4]
++        b           5f
++2:      vdup.32     q9,  d17[1]
++3:      vdup.32     q10, d19[1]
++4:      vdup.32     q11, d21[1]
++5:      vstm        lr,  { q8-q11}
++1:
++        eors        r7,  r2                         @ r7 ^= req; zero means nothing is left to fill
++        beq         99f
++
++        lsls        r12, r7,  #AVAIL_S_UR_N_U_C     @ N/C now select the up-right / up fills below
++        vdup.32     q0,  d31[0]
++        vdup.32     q1,  d31[0]
++        vdup.32     q2,  d31[0]
++        vdup.32     q3,  d31[0]
++        add         lr,  r1,  #(pw << log2_s)       @ lr = top + 16 pels
++        vdup.32     q8,  d31[1]
++        vdup.32     q9,  d31[1]
++        vdup.32     q10, d31[1]
++        vdup.32     q11, d31[1]
++        it cs
++        vstmcs      r1,  { q0-q3 }
++        it mi
++        vstmmi      lr,  { q8-q11}
++
++        lsls        r7,  #AVAIL_S_L_N_DL_C          @ N/C now select the left / down-left fills below
++        vdup.32     q0,  d30[0]
++        vdup.32     q1,  d30[0]
++        vdup.32     q2,  d30[0]
++        vdup.32     q3,  d30[0]
++        add         lr,  r0,  #(pw << log2_s)       @ lr = left + 16 pels
++        it mi
++        vstmmi      r0,  { q0-q3 }
++        it cs
++        vstmcs      lr,  { q0-q3 }
++
++99:
++        pop         {r4-r10, pc}
++endfunc
++
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+new file mode 100644
+index 0000000000..56819ae439
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,920 @@
++/*
++Copyright (c) 2018 Raspberry Pi
(Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * Horizontal & Vertical special cases of angular intra pred ++ * ++ * Split out because: ++ * Vertical, at least, is relatively common ++ * Much simpler code than the general angular case ++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else ++ * ++ * *** Currently luma filtering is mandatory where it occurs, but there are ++ * cases where it should be turned off (rdpcm & an extension sps flag). 
++ * These don't occur in the standard conformance suite for Main Profile ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.32 {d0[0]}, [r2 :32] @ Left ++ add r2, r0, r3 ++ vld1.8 {d1[]}, [r1] ++ lsl r3, #1 ++ vdup.8 d4, ip ++ vmov.i8 d2, #128 ++ vhsub.u8 d4, d0, d4 ++ veor d1, d2 ++ vld1.32 {d0[0]}, [r1 :32] @ Top ++ vqadd.s8 d1, d4 ++ vmov.i64 d3, #0xff ++ vmov d4, d0 ++ veor d5, d1, d2 ++ veor d1, d1, d2 ++ vbit d0, d1, d3 ++ vshr.u64 d5, #8 ++ vst1.32 {d0[0]}, [r0], r3 ++ vshr.u64 d1, #16 ++ vbit d4, d5, d3 ++ vshr.u64 d5, #16 ++ vst1.32 {d4[0]}, [r2], r3 ++ vbit d0, d1, d3 ++ vst1.32 {d0[0]}, [r0] ++ vbit d4, d5, d3 ++ vst1.32 {d4[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {d0}, [r2 :64] @ Left ++ vmov.i8 d1, #128 ++ vld1.8 {d2[]}, [r1] ++ vld1.8 {d3}, [r1 :64] @ Top ++ vdup.8 d4, ip ++ vhsub.u8 d4, d0, d4 ++ veor d2, d1 ++ vmov.i64 d0, #0xff ++ mov r1, #8 ++ vqadd.s8 d2, d4, d2 ++ veor d1, d2, d1 ++1: ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ subs r1, #2 ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {q0}, [r2 :128] @ Left ++ vdup.8 q1, ip ++ vld1.8 {d4[],d5[]}, [r1] ++ vhsub.u8 q0, q1 ++ vmov.i8 q1, #128 ++ veor q2, q1 
++ vmov.i64 d16, #0xff ++ vqadd.s8 q0, q2 ++ vld1.8 {q3}, [r1 :128] @ Top ++ mov r1, #16 ++ veor q0, q1 ++ vmov q1, q3 ++ vext.8 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.8 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vext.8 q2, q2, q2, #2 ++ vst1.8 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 ++ vld1.8 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d0 }, [r2 :64], r3 ++ vst1.16 {d0 }, [r0 :64] ++ vst1.16 {d0 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #2 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8
++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++@ ? Might be faster as simple arm ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.32 {d0[0]}, [r1 :32] @ Top ++ add r1, r2, #3 ++ vld1.8 {d1[]}, [r2]! ++ vdup.8 d2, ip ++ vmov.i8 d3, #128 ++ vhsub.u8 d0, d2 ++ veor d1, d3 ++ vld1.8 {d2[]}, [r2]! ++ add ip, r0, r3 ++ vqadd.s8 d0, d0, d1 ++ lsl r3, #1 ++ vld1.8 {d1[]}, [r2] ++ vld1.8 {d4[]}, [r1] ++ veor d0, d3 ++ vst1.32 {d0[0]}, [r0 :32], r3 ++ vst1.32 {d2[0]}, [ip :32], r3 ++ vst1.32 {d1[0]}, [r0 :32] ++ vst1.32 {d4[0]}, [ip :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {d0}, [r1 :64] @ Top ++ vmov.i8 d1, #128 ++ vld1.8 {d2[]}, [r2]! ++ mov r1, #8-2 ++ vdup.8 d3, ip ++ vhsub.u8 d0, d3 ++ veor d2, d1 ++ vqadd.s8 d0, d2 ++ vld1.8 {d2[]}, [r2]! ++ veor d0, d1 ++ vst1.8 {d0}, [r0], r3 ++1: ++ vld1.8 {d0[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {d2}, [r0 :64], r3 ++ vld1.8 {d2[]}, [r2]! ++ vst1.8 {d0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.8 {d2}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {q0}, [r1 :64] @ Top ++ mov r1, #16-2 ++ vld1.8 {d4[],d5[]}, [r2]! ++ vdup.8 q3, ip ++ vhsub.u8 q0, q3 ++ vmov.i8 q1, #128 ++ veor q2, q1 ++ vqadd.s8 q0, q2 ++ vld1.8 {d4[],d5[]}, [r2]!
++ veor q0, q1 ++ vst1.8 {q0}, [r0], r3 ++1: ++ vld1.8 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {q2}, [r0 :64], r3 ++ vld1.8 {d4[],d5[]}, [r2]! ++ vst1.8 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.8 {q2}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 ++ vld1.8 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ mov r1, #32-2 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++1: ++ vld1.8 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vst1.8 {q1}, [ip :128], r3 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.8 {q1}, [r0 :128] ++ vst1.8 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 ++ add r1, r2, #2 ++ vld1.16 {d0[]}, [r2] ++ add r2, #4 ++ vld1.16 {d1[]}, [r1] ++ add r1, #4 ++ vld1.16 {d2[]}, [r2] ++A add r2, r0, r3, lsl #1 ++T lsl r3, #1 ++T add r2, r0, r3 ++ vld1.16 {d3[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vst1.16 {d0}, [r0 :64], r3 ++ vst1.16 {d1}, [r2 :64], r3 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ vld1.16 {d2[],d3[]}, [r2]! ++ mov r1, #8-2 ++ vst1.16 {q0}, [r0 :64], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :64], r3 ++ vld1.16 {d2[],d3[]}, [r2]! 
++ vst1.16 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ add ip, r0, #16 ++ mov r1, #16-2 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 Bit ++@ Has clipping constants so 10-bit only but could easily be macroed up to ++@ 14-bit before we run out of bits ++ ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {d0}, [r2 :64] @ Left ++ vmov.i16 d2, #0 ++ vld1.16 {d1[]}, [r1] ++T lsl r3, #1 ++ vdup.16 d4, ip ++ vmov.i16 d3, #0x3ff ++ vld1.16 {d5}, [r1 :64] @ Top ++ vhsub.u16 d4, d0, d4 ++ vmov.i64 d0, #0xffff ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 d1, d1, d4 ++ vmov d6, d5 ++ vmax.s16 d1, d1, d2 ++ vmin.s16 d2, d1, d3 ++ vmin.s16 d1, d1, d3 ++ vbit d5, d1, d0 ++A lsl r3, #2 ++T lsl r3, #1 ++ vshr.u64 d2, #16 ++ vshr.u64 d1, #32 ++ vbit d6, d2, d0 ++ vst1.16 {d5}, [r0], r3 ++ vshr.u64 d2, #32 ++ vst1.16 {d6}, [r2], r3 ++ vbit d5, d1, d0 ++ vst1.16 {d5}, [r0] ++ vbit d6, d2, d0 ++ vst1.16 {d6}, [r2] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const 
uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0}, [r2 :128] @ Left ++ lsl r3, #1 ++ vdup.16 q1, ip ++ vld1.16 {d4[],d5[]}, [r1] ++ vhsub.u16 q0, q0, q1 ++ vmov.i16 q1, #0 ++ vadd.i16 q0, q2 ++ vmov.i16 q2, #0x3ff ++ vld1.16 {q3}, [r1 :128] @ Top ++ mov r1, #8 ++ vmax.s16 q0, q1 ++ vmov q1, q3 ++ vmin.s16 q0, q2 ++ vmov.i64 d16, #0xffff ++ vext.16 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0-q1}, [r2 :128] @ Left ++T lsl r3, #1 ++ vdup.16 q2, ip ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vld1.16 {d6[],d7[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vhsub.u16 q0, q2 ++ vhsub.u16 q1, q2 ++ vadd.i16 q0, q3 ++ vadd.i16 q1, q3 ++ vmov.i16 q2, #0 ++ vld1.16 {q8-q9}, [r1 :128] @ Top ++ mov r1, #0 ++ vmov.i16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmax.s16 q1, q2 ++ vmin.s16 q0, q3 ++ vmin.s16 q1, q3 ++ vmov q10, q8 ++ vmov q11, q9 ++ vext.16 q2, q0, q1, #1 ++ vext.16 q3, q1, q1, #1 ++ vmov.i64 d24, #0xffff ++1: ++ vbit d16, d0, d24 ++ vbit d20, d4, d24 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++1: ++ vbit d16, d2, d24 ++ vbit d20, d6, d24 ++ vext.16 q1, q1, q1, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q3, q3, q3, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t 
*_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ lsl r3, #1 ++ mov r1, #32 ++ add r2, r0, #32 ++1: ++ vst1.16 {q0-q1}, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2-q3}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128] ++ vst1.16 {q0 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ lsl r3, #2 ++ mov r1, #16 ++ add r2, r0, #32 ++1: ++ vst1.16 {q0-q1}, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2-q3}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {d0}, [r1 :64] @ Top ++ vmov.i16 d1, #0 ++ vld1.16 {d2[]}, [r2]! 
++T lsl r3, #1 ++ vdup.16 d3, ip ++ vmov.i16 d4, #0x3ff ++ vhsub.u16 d0, d3 ++A add ip, r0, r3, lsl #1 ++T add ip, r0, r3 ++ vld1.16 {d3[]}, [r2]! ++A lsl r3, #2 ++T lsl r3, #1 ++ vadd.i16 d0, d2 ++ vld1.16 {d2[]}, [r2]! ++ vmax.s16 d0, d1 ++ vld1.16 {d1[]}, [r2] ++ vmin.s16 d0, d4 ++ vst1.16 {d0}, [r0 :64], r3 ++ vst1.16 {d3}, [ip :64], r3 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d1}, [ip :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0}, [r1 :128] @ Top ++ lsl r3, #1 ++ vdup.16 q1, ip ++ mov r1, #8-2 ++ vhsub.u16 q0, q1 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vmov.i16 q2, #0 ++ vadd.i16 q0, q1 ++ vmov.i16 q1, #0x3ff ++ vmax.s16 q0, q2 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vmin.s16 q0, q1 ++ vst1.16 {q0}, [r0 :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q2}, [r0 :128], r3 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ bne 1b ++ ++ vst1.16 {q2}, [r0 :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0-q1}, [r1 :128] @ Top ++ lsl r3, #1 ++ vdup.16 q2, ip ++ add ip, r0, r3 ++ vhsub.u16 q0, q2 ++ add ip, #16 ++ vhsub.u16 q1, q2 ++ mov r1, #16-2 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vmov.i16 q3, #0 ++ vadd.u16 q0, q2 ++ vadd.i16 q1, q2 ++ vmov.i16 q2, #0x3ff ++ vmax.s16 q0, q3 ++ vmax.s16 q1, q3 ++ vld1.16 {d6[],d7[]}, [r2]! ++ vmin.s16 q0, q2 ++ vmin.s16 q1, q2 ++ vst1.16 {q0-q1}, [r0 :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ vst1.16 {q3}, [ip :128], r3 ++ vld1.16 {d6[],d7[]}, [r2]!
++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q3}, [r0 :128] ++ vst1.16 {q3}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.16 {d2[],d3[]}, [r2]! ++ lsl r3, #1 ++ vst1.16 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.16 {q0}, [ip :128], lr ++ mov r1, #32-2 ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], lr ++ vst1.16 {q0}, [ip :128], lr ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 ++ add r1, r2, #4 ++ vld1.32 {d0[],d1[]}, [r2] ++ add r2, #8 ++ vld1.32 {d2[],d3[]}, [r1] ++ add r1, #8 ++ vld1.32 {d4[],d5[]}, [r2] ++A add r2, r0, r3, lsl #2 ++T lsl r3, #2 ++T add r2, r0, r3 ++ vld1.32 {d6[],d7[]}, [r1] ++A lsl r3, #3 ++T lsl r3, #1 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q1}, [r2 :128], r3 ++ vst1.32 {q2}, [r0 :128] ++ vst1.32 {q3}, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, 
[r2]! ++ lsl r3, #2 ++ add ip, r0, #16 ++ mov r1, #8-2 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.32 {d2[],d3[]}, [r2]! ++ lsl r3, #2 ++ vst1.32 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.32 {q0}, [ip :128], lr ++ mov r1, #16-2 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], lr ++ vst1.32 {q0}, [ip :128], lr ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +new file mode 100644 +index 0000000000..af8c4c03f0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +@@ -0,0 +1,1043 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ Planar intra pred (8.4.4.2.4) ++@ ++@ predSamples[ x ][ y ] = ++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + ++@ ( x + 1 ) * p[ nTbS ][ -1 ] + ++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + ++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) ++ ++@ All 10-bit functions would work with 9 ++ ++ ++@ ff_hevc_rpi_pred_planar_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_8, export=1 ++ ++ vld1.8 {d0}, [r1] @ Top ++ adr ip, nb_3_0_1_4 ++ vld1.8 {d1}, [r2] @ Left ++ vmov.i64 d2, #0xffffffff ++ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} ++ add r1, r0, r3 ++ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} ++ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} ++ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} ++ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} ++ vshll.u8 q8, d4, #2 ++ lsl r3, #1 ++ vsubl.u8 q2, d5, d4 ++ vmlal.u8 q8, d0, d3 ++ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} ++ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} ++ vshl.s16 q9, q2, #1 ++ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} ++ vadd.i16 d16, d4 ++ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} ++ vadd.i16 d17, d18 ++ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} ++ vadd.i16 q2, q8, q9 ++ vmlal.u8 q8, d0, d6 ++ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} ++ vmlal.u8 q2, d0, d7 ++ vrshrn.i16 d0, q8, #3 ++ vst1.32 d0[0], [r0 :32], r3 ++ vst1.32 d0[1], [r1 :32], r3 ++ vrshrn.i16 d0, q2, #3 ++ vst1.32 d0[0], [r0 :32] ++ vst1.32 d0[1], [r1 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0}, [r1 :64] @ Top
++ adr ip, nbh_3_0_1_4 ++ vldr d2, [r2, #8] @ Left (lower) ++ vldr d3, [ip, #8] @ {1,2,3,4} ++T lsl r3, #1 ++ vshl.s16 d4, d0, #2 ++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4} ++ vldr d5, [r2] @ Left (upper) ++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4} ++ vldr d6, [ip] @ {3,2,1,0} ++ vmla.i16 d4, d3, d1 @ Acc set up ++ vsub.i16 d0, d2, d0 @ Add set up ++ vmov d7, d6 ++ vdup.16 d2, d5[0] ++ vdup.16 d3, d5[1] ++ vdup.16 d16, d5[2] ++ vadd.i16 d18, d0, d4 ++ vshl.s16 d0, #1 @ x2 ++ vadd.i16 d19, d0, d4 ++ vdup.16 d17, d5[3] ++ vadd.i16 d4, d0, d18 ++A add r1, r0, r3, lsl #1 ++T add r1, r0, r3 ++ vadd.i16 d5, d0, d19 ++A lsl r3, #2 ++T lsl r3, #1 ++ vmla.i16 q9, q1, q3 ++ vmla.i16 q2, q8, q3 ++ vrshr.u16 q0, q9, #3 ++ vst1.16 {d0}, [r0], r3 ++ vrshr.u16 d2, d4, #3 ++ vst1.16 {d1}, [r1], r3 ++ vrshr.u16 d3, d5, #3 ++ vst1.16 {d2}, [r0] ++ vst1.16 {d3}, [r1] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_8, export=1 ++ ++ vld1.8 {q0}, [r1] @ Top ++ adr ip, nb_7_0_1_8 ++ vldr d2, [r2, #8] @ Left (lower) ++ mov r1, #8 ++ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} ++ vshll.u8 q2, d0, #3 ++ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} ++ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} ++ vldr d6, [r2] @ Left (upper) ++ vmlal.u8 q2, d3, d1 ++ vsubl.u8 q0, d2, d0 ++ vldr d7, [ip] @ {7,6,5,4,3,2,1,0} ++ ++@ u8 7..0 [1] d7 ++@ u8 left[y] [1] d6 ++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vdup.8 d2, d6[0] ++ vadd.i16 q2, q0 ++ vdup.8 d3, d6[1] ++ vadd.i16 q8, q2, q0 ++1: ++ vmlal.u8 q2, d7, d2 ++ subs r1, #2 ++ vadd.i16 q9, q8, q0 ++ vmlal.u8 q8, d7, d3 ++ vdup.8 d2, d6[2] ++ vdup.8 d3, d6[3] ++ vrshrn.i16 d20, q2, #4 ++ vshr.u64 d6, #16 ++ vmov q2, q9 ++ vst1.8 {d20}, [r0], r3 ++ vrshrn.i16 d20, q8, #4 ++ vadd.i16 q8, q2, q0 ++ vst1.8 
{d20}, [r0], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_10, export=1 ++ ++ adr ip, nb_7_0_1_8 ++ vld1.16 {q0}, [r1 :128]! @ Top (left) ++ lsl r3, #1 ++ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} ++ add ip, r2, #16 ++ vld1.16 {d4[],d5[]}, [r1] @ Top (right) ++ mov r1, #8-2 ++ vshl.s16 q3, q0, #3 ++ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} ++ vld1.16 {d18[],d19[]}, [ip] @ Left (lower) ++ vmla.i16 q3, q8, q2 @ Acc set up ++ vsub.i16 q0, q9, q0 @ Add set up ++ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} ++ vadd.i16 q2, q3, q0 ++ ++@ u16 7..0 [1] q1 ++@ u32 left[y] [1] [r2] ++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.16 {d6[],d7[]}, [r2]! ++ vadd.i16 q8, q2, q0 ++ vld1.16 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++1: ++ vrshr.u16 q9, q2, #4 ++ subs r1, #2 ++ vmov q2, q3 ++ vrshr.u16 q10, q8, #4 ++ vld1.16 {d6[],d7[]}, [r2]! ++ vst1.16 {q9}, [r0 :128], r3 ++ vadd.i16 q8, q2, q0 ++ vld1.16 {d18[],d19[]}, [r2]! 
++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ vst1.16 {q10}, [r0 :128], r3 ++ bne 1b ++ ++ vrshr.u16 q9, q2, #4 ++ add r3, r0 ++ vrshr.u16 q10, q8, #4 ++ vst1.16 {q9}, [r0 :128] ++ vst1.16 {q10}, [r3 :128] ++ ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - has to be in two lumps to ensure we can always reach using adr ++ ++ .balign 64 ++ ++nb_31_0_1_32: ++ .byte 31, 30, 29, 28, 27, 26, 25, 24 ++ .byte 23, 22, 21, 20, 19, 18, 17, 16 ++nb_15_0_1_16: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++ .byte 9, 10, 11, 12, 13, 14, 15, 16 ++ .byte 17, 18, 19, 20, 21, 22, 23, 24 ++ .byte 25, 26, 27, 28, 29, 30, 31, 32 ++ ++ @ should be back on a 64-byte boundary here ++ ++ @ These could be extracted from the above array, but separate out ++ @ out for better (16 byte) alignment ++nb_3_0_1_4: ++ .byte 3, 2, 1, 0, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 1, 2, 3, 4 ++nb_7_0_1_8: ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++nbh_3_0_1_4: ++ .short 3, 2, 1, 0, 1, 2, 3, 4 ++ ++@------------------------------------------------------------------------------ ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_8, export=1 ++ ++ adr ip, nb_15_0_1_16 + 16 ++ vld1.8 {q0}, [r1 :128]! 
@ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} ++ vld1.8 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #4 ++ mov r1, #16 ++ vshll.u8 q8, d1, #4 ++ vld1.8 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {15,14,13...0} ++ ++@ u8 15..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.8 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.8 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #5 ++ vrshrn.u16 d19, q8, #5 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #5 ++ vrshrn.u16 d19, q11, #5 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nb_15_0_1_16 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,2,3...16} ++ lsl r3, #1 ++ vld1.16 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #16 ++ vshl.i16 q9, q0, #4 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #4 ++ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {15,14,13...0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 15..0 [2] q8,q9 ++@ u32 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.16 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.16 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #5 ++ vrshr.u16 q15, q3, #5 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #5 ++ vrshr.u16 q15, q11, #5 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} ++ add r2, #32 ++ vld1.8 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #5 ++ mov r1, #32 ++ vld1.8 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #5 ++ vshll.u8 q10, d2, #5 ++ vshll.u8 q11, d3, #5 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} ++ ++@ u8 31..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.8 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.8 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.8 {d12[]}, [r2]! 
++ vrshrn.u16 d19, q9, #6 ++ vrshrn.u16 d18, q8, #6 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #6 ++ vrshrn.u16 d21, q11, #6 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} ++T lsl r3, #1 ++ vld1.16 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #32 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #5 ++ vshl.i16 q9, q1, #5 ++ vshl.i16 q10, q2, #5 ++ vshl.i16 q11, q3, #5 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vmov.u16 ip, d0[0] ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 31..0 [4] q4-q7 ++@ u16 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] ++ ++ 
vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #1 ++T sub r0, r3 ++1: ++ vld1.16 {d0[0]}, [r2]! ++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, d0[0] ++ vmla.i16 q9, q5, d0[0] ++ vmla.i16 q10, q6, d0[0] ++ vmla.i16 q11, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q8, #6 ++ vrshr.u16 q9, #6 ++ vrshr.u16 q10, #6 ++ vrshr.u16 q11, #6 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vld1.16 {d0[0]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, d0[0] ++ vmla.i16 q13, q5, d0[0] ++ vmla.i16 q14, q6, d0[0] ++ vmla.i16 q15, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q12, #6 ++ vrshr.u16 q13, #6 ++ vrshr.u16 q14, #6 ++ vrshr.u16 q15, #6 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 ++ ++ vld1.8 {q0}, [r1] @ Top ++ adr ip, nbx2_3_0_1_4 ++ vldr d2, [r2, #8] @ Left (lower) ++ mov r1, #4 ++ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} ++ lsl r3, #1 ++ vshll.u8 q2, d0, #2 ++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} ++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} ++ vldr d6, [r2] @ Left (upper) ++ vmlal.u8 q2, d3, d1 ++ vsubl.u8 q0, d2, d0 ++ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} ++ ++@ u8 3..0 [1] d7 ++@ u8 left[y] [1] d6 ++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vdup.16 d2, d6[0] ++ vadd.i16 q2, q0 ++ vdup.16 d3, d6[1] ++ vadd.i16 q8, q2, q0 ++1: ++ vmlal.u8 q2, d7, d2 ++ subs r1, #2 ++ vadd.i16 q9, q8, q0 ++ vmlal.u8 q8, d7, d3 ++ vdup.16 d2, d6[2] ++ vdup.16 d3, d6[3] ++ vrshrn.i16 d20, q2, #3 ++ vmov q2, q9 ++ vst1.8 {d20}, 
[r0], r3 ++ vrshrn.i16 d20, q8, #3 ++ vadd.i16 q8, q2, q0 ++ vst1.8 {d20}, [r0], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 ++ ++ adr ip, nbx2_3_0_1_4 ++ vld1.16 {q0}, [r1 :128]! @ Top (left) ++ lsl r3, #2 ++ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} ++ add ip, r2, #16 ++ vld1.32 {d4[],d5[]}, [r1] @ Top (right) ++ vshl.s16 q3, q0, #2 ++ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} ++ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) ++ vmla.i16 q3, q8, q2 @ Acc set up ++ vsub.i16 q0, q9, q0 @ Add set up ++ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} ++ vadd.i16 q2, q3, q0 ++ ++@ u16 3..0 [1] q1 ++@ u32 left[y] [1] [r2] ++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.32 {d6[],d7[]}, [r2]! ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ ++ vrshr.u16 q9, q2, #3 ++ vmov q2, q3 ++ vrshr.u16 q10, q8, #3 ++ vld1.32 {d6[],d7[]}, [r2]! ++ vst1.16 {q9}, [r0 :128], r3 ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ vst1.16 {q10}, [r0 :128], r3 ++ ++ vrshr.u16 q9, q2, #3 ++ add r3, r0 ++ vrshr.u16 q10, q8, #3 ++ vst1.16 {q9}, [r0 :128] ++ vst1.16 {q10}, [r3 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 ++ ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.8 {q0}, [r1 :128]! 
@ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #1 ++ vld1.16 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #3 ++ mov r1, #8 ++ vshll.u8 q8, d1, #3 ++ vld1.16 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} ++ ++@ u8 7..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.16 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.16 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #4 ++ vrshrn.u16 d19, q8, #4 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #4 ++ vrshrn.u16 d19, q11, #4 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - has to be in two lumps to ensure we can always reach using adr ++ ++ .balign 64 ++ ++nbx2_15_0_1_16: ++ .byte 15, 15, 14, 14, 13, 13, 12, 12 ++ .byte 11, 11, 10, 10, 9, 9, 8, 8 ++nbx2_7_0_1_8: ++ .byte 7, 7, 6, 6, 5, 5, 4, 4 ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ .byte 5, 5, 6, 6, 7, 7, 8, 8 ++ .byte 9, 9, 10, 10, 11, 11, 12, 12 ++ .byte 13, 13, 14, 14, 15, 15, 16, 16 ++ ++ @ should be back on a 64-byte boundary here ++ ++nbx2_3_0_1_4: ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ ++@------------------------------------------------------------------------------ ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) 
[r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #2 ++ vld1.32 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #8 ++ vshl.i16 q9, q0, #3 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #3 ++ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 7..0 [2] q8,q9 ++@ u32 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.32 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.32 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #4 ++ vrshr.u16 q15, q3, #4 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #4 ++ vrshr.u16 q15, q11, #4 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} ++ add r2, #32 ++ vld1.16 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #4 ++ mov r1, #16 ++ vld1.16 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #4 ++ lsl r3, #1 ++ vshll.u8 q10, d2, #4 ++ vshll.u8 q11, d3, #4 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ ++@ u8 15..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.16 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.16 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.16 {d12[]}, [r2]! 
++ vrshrn.u16 d19, q9, #5 ++ vrshrn.u16 d18, q8, #5 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} ++T lsl r3, #2 ++ vld1.32 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #16 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.32 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #4 ++ vshl.i16 q9, q1, #4 ++ vshl.i16 q10, q2, #4 ++ vshl.i16 q11, q3, #4 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vpush {q0} ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 15..0 [4] q4-q7 ++@ u32 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] -
p[x][-1] ++ ++ vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #2 ++T sub r0, r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, q0 ++ vmla.i16 q9, q5, q0 ++ vmla.i16 q10, q6, q0 ++ vmla.i16 q11, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q8, #5 ++ vrshr.u16 q9, #5 ++ vrshr.u16 q10, #5 ++ vrshr.u16 q11, #5 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vld1.32 {d0[],d1[]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, q0 ++ vmla.i16 q13, q5, q0 ++ vmla.i16 q14, q6, q0 ++ vmla.i16 q15, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q12, #5 ++ vrshr.u16 q13, #5 ++ vrshr.u16 q14, #5 ++ vrshr.u16 q15, #5 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q3-q7} ++ bx lr ++ ++endfunc +diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c +index 2cca784f5a..48cb816b70 100644 +--- a/libavcodec/arm/vc1dsp_init_neon.c ++++ b/libavcodec/arm/vc1dsp_init_neon.c +@@ -19,6 +19,7 @@ + #include + + #include "libavutil/attributes.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + +@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc + void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void 
ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int rnd); + +@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. */ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + #define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon +@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ 
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); +@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S +index 93f043bf08..96014fbebc 100644 +--- a/libavcodec/arm/vc1dsp_neon.S ++++ b/libavcodec/arm/vc1dsp_neon.S +@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 + vst1.32 {d1[1]}, [r0,:32] + bx lr + endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1[0]}, [r0], r1 @ P5 ++ vld1.32 {d2[0]}, [r3], r1 @ P1 ++ vld1.32 {d3[0]}, [r3], r1 @ P2 ++ vld1.32 {d4[0]}, [r0], r1 @ P6 ++ vld1.32 {d5[0]}, [r3], r1 @ P3 ++ vld1.32 {d6[0]}, [r0], r1 @ P7 ++ vld1.32 {d7[0]}, [r3] @ P4 ++ vld1.32 {d16[0]}, [r0] @ P8 ++ vshll.u8 q9, d1, #1 @ 2*P5 ++ vdup.16 d17, r2 @ pq ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vmovl.u8 q11, d3 @ P2 ++ vmovl.u8 q1, d4 @ P6 ++ vmovl.u8 q12, d5 @ P3 ++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q11, d6 @ P7 ++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmovl.u8 q3, d7 @ P4 ++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 
q11, d16 @ P8 ++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 ++ vmovl.u8 q12, d1 @ P5 ++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 ++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vsub.i16 d1, d6, d24 @ P4-P5 ++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 ++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vabs.s16 d2, d1 ++ vrshr.s16 d3, d18, #3 ++ vrshr.s16 d5, d20, #3 ++ vshr.s16 d2, d2, #1 @ clip ++ vrshr.s16 d4, d4, #3 ++ vabs.s16 d3, d3 @ a2 ++ vshr.s16 d1, d1, #8 @ clip_sign ++ vabs.s16 d5, d5 @ a1 ++ vceq.i16 d7, d2, #0 @ test clip == 0 ++ vabs.s16 d16, d4 @ a0 ++ vshr.s16 d4, d4, #8 @ a0_sign ++ vcge.s16 d18, d5, d3 @ test a1 >= a2 ++ vcge.s16 d17, d16, d17 @ test a0 >= pq ++ vbsl d18, d3, d5 @ a3 ++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign ++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d5, d18, d16 @ test a3 >= a0 ++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vcge.s16 d4, d0, d2 ++ tst r0, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d4, d2, d0 @ FFMIN(d, clip) ++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q12 ++ vst1.32 {d0[0]}, [r3], r1 ++ vst1.32 {d1[0]}, [r3] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3] ++ vdup.16 d1, r2 @ pq ++ vtrn.8 q1, q2 ++ vtrn.16 d2, d3 @ P1, P5, P3, P7 ++ vtrn.16 d4, d5 @ P2, P6, P4, P8 ++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 ++ vmovl.u8 q8, d4 @ P2, P6 ++ vmovl.u8 q9, d3 @ P3, P7 ++ vmovl.u8 q2, d5 @ P4, P8 ++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 ++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 ++ vmovl.u8 q1, d2 @ P1, P5 ++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later ++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 ++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 ++ vsub.i16 d3, d4, d2 @ P4-P5 ++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 d5, d3 ++ vshr.s16 d3, d3, #8 @ clip_sign ++ vrshr.s16 d16, d20, #3 ++ vabs.s16 q3, 
q3 @ a1, a2 ++ vshr.s16 d5, d5, #1 @ clip ++ vabs.s16 d17, d16 @ a0 ++ vceq.i16 d18, d5, #0 @ test clip == 0 ++ vshr.s16 d16, d16, #8 @ a0_sign ++ vcge.s16 d19, d6, d7 @ test a1 >= a2 ++ vcge.s16 d1, d17, d1 @ test a0 >= pq ++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign ++ vbsl d19, d7, d6 @ a3 ++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ ++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d3[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vcge.s16 d3, d0, d5 ++ tst r2, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d3, d5, d0 @ FFMIN(d, clip) ++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d0, q2 ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1}, [r0 :64], r1 @ P5 ++ vld1.32 {d2}, [r3 :64], r1 @ P1 ++ vld1.32 {d3}, [r3 :64], r1 @ P2 ++ vld1.32 {d4}, [r0 :64], r1 @ P6 ++ vld1.32 {d5}, [r3 
:64], r1 @ P3 ++ vld1.32 {d6}, [r0 :64], r1 @ P7 ++ vshll.u8 q8, d1, #1 @ 2*P5 ++ vshll.u8 q9, d2, #1 @ 2*P1 ++ vld1.32 {d7}, [r3 :64] @ P4 ++ vmovl.u8 q1, d3 @ P2 ++ vld1.32 {d20}, [r0 :64] @ P8 ++ vmovl.u8 q11, d4 @ P6 ++ vdup.16 q12, r2 @ pq ++ vmovl.u8 q13, d5 @ P3 ++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q1, d6 @ P7 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q3, d7 @ P4 ++ vmovl.u8 q10, d20 @ P8 ++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q1, d1 @ P5 ++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q13, q3, q1 @ P4-P5 ++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q10, q13 ++ vshr.s16 q13, q13, #8 @ clip_sign ++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q10, q10, #1 @ clip ++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q11, q10, #0 @ test clip == 0 ++ vrshr.s16 q9, q9, #3 ++ vabs.s16 q8, q8 @ a2 ++ vabs.s16 q9, q9 @ a1 ++ vrshr.s16 q2, q2, #3 ++ vcge.s16 q14, q9, q8 @ test a1 >= a2 ++ vabs.s16 q15, q2 @ a0 ++ vshr.s16 q2, q2, #8 @ a0_sign ++ vbsl q14, q8, q9 @ a3 ++ vcge.s16 q8, q15, q12 @ test a0 >= pq ++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign ++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q12, q14, q15 @ test a3 >= a0 ++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vshl.i64 q11, q9, #16 ++ vmov.32 r0, d18[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r2, d19[1] ++ vshr.s64 q9, q11, #48 ++ vcge.s16 q11, q0, q10 ++ vorr q8, q8, q9 ++ and r0, r0, r2 ++ vbsl q11, q10, q0 @ FFMIN(d, clip) ++ tst r0, #1 ++ bne 1f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered ++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q1 ++ vst1.32 {d0}, [r3 :64], r1 ++ vst1.32 {d1}, [r3 :64] ++1: bx lr ++endfunc ++ ++.align 5 ++.Lcoeffs: ++.quad 0x00050002 ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ push {lr} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ add r12, r0, r1, lsl #2 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d7}, [r3], r1 ++ vld1.32 {d17}, [r3] ++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... ++ vdup.16 q9, r2 @ pq ++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... ++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++ vtrn.32 d2, d6 @ P1, P5 ++ vtrn.32 d4, d16 @ P2, P6 ++ vtrn.32 d3, d7 @ P3, P7 ++ vtrn.32 d5, d17 @ P4, P8 ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vshll.u8 q11, d6, #1 @ 2*P5 ++ vmovl.u8 q12, d4 @ P2 ++ vmovl.u8 q13, d16 @ P6 ++ vmovl.u8 q14, d3 @ P3 ++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q12, d7 @ P7 ++ vshll.u8 q1, d3, #1 @ 2*P3 ++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q2, d5 @ P4 ++ vmovl.u8 q8, d17 @ P8 ++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q3, d6 @ P5 ++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q12, q2, q3 @ P4-P5 ++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q8, q12 ++ vshr.s16 q12, q12, #8 @ clip_sign ++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q8, q8, #1 @ clip ++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q11, q11, #3 ++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q13, q8, #0 @ test clip == 0 ++ vrshr.s16 q10, q10, #3 ++ vabs.s16 q11, q11 @ a2 ++ vabs.s16 q10, q10 @ a1 ++ vrshr.s16 q1, q1, #3 ++ vcge.s16 q14, q10, q11 @ test a1 >= a2 ++ vabs.s16 q15, q1 @ a0 ++ vshr.s16 q1, q1, #8 @ a0_sign ++ vbsl q14, q11, q10 @ a3 ++ vcge.s16 q9, q15, q9 @ test a0 >= pq ++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign ++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q11, q14, q15 @ test a3 >= a0 ++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d20[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r3, d21[1] ++ vcge.s16 q10, q0, q8 ++ and r14, r2, r3 ++ vbsl q10, q8, q0 @ FFMIN(d, clip) ++ tst r14, #1 ++ bne 2f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q3 ++ vqmovun.s16 d0, q2 ++ tst r2, #1 ++ bne 1f @ none of the first 4 pixel pairs should be updated if so ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: tst r3, #1 ++ bne 2f @ none of the second 4 pixel pairs should be updated if so ++ vst2.8 {d0[4], d1[4]}, [r12], r1 ++ vst2.8 {d0[5], d1[5]}, [r12], r1 ++ vst2.8 {d0[6], d1[6]}, [r12], r1 ++ vst2.8 {d0[7], d1[7]}, [r12] ++2: pop {pc} ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ vpush {d8-d15} ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.64 {q1}, [r0 :128], r1 @ P5 ++ vld1.64 {q2}, [r3 :128], r1 @ P1 ++ vld1.64 {q3}, [r3 :128], r1 @ P2 ++ vld1.64 {q4}, [r0 :128], r1 @ P6 ++ vld1.64 {q5}, [r3 :128], r1 @ P3 ++ vld1.64 {q6}, [r0 :128], r1 @ P7 ++ vshll.u8 q7, d2, #1 @ 2*P5[0..7] ++ vshll.u8 q8, d4, #1 @ 2*P1[0..7] ++ vld1.64 {q9}, [r3 :128] @ P4 ++ vmovl.u8 q10, d6 @ P2[0..7] ++ vld1.64 {q11}, [r0 :128] @ P8 ++ vmovl.u8 q12, d8 @ P6[0..7] ++ vdup.16 q13, r2 @ pq ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vshll.u8 q10, d3, #1 @ 2*P5[8..15] ++ 
vmovl.u8 q3, d7 @ P2[8..15] ++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q4, d9 @ P6[8..15] ++ vmovl.u8 q14, d10 @ P3[0..7] ++ vmovl.u8 q15, d12 @ P7[0..7] ++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vshll.u8 q3, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q6, d13 @ P7[8..15] ++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q14, d18 @ P4[0..7] ++ vmovl.u8 q9, d19 @ P4[8..15] ++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vmovl.u8 q15, d11 @ P3[8..15] ++ vshll.u8 q5, d11, #1 @ 2*P3[8..15] ++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q15, d22 @ P8[0..7] ++ vmovl.u8 q11, d23 @ P8[8..15] ++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q6, d2 @ P5[0..7] ++ vmovl.u8 q1, d3 @ P5[8..15] ++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7] ++ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q7, q7, #3 ++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vabs.s16 q11, q15 ++ vabs.s16 q8, q8 @ a1[0..7] ++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vshr.s16 q15, q15, #8 @ clip_sign[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q7, q7 @ a2[0..7] ++ vrshr.s16 q10, q10, #3 ++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vrshr.s16 q3, q3, #3 ++ 
vabs.s16 q10, q10 @ a2[8..15] ++ vbsl q4, q7, q8 @ a3[0..7] ++ vabs.s16 q7, q12 ++ vshr.s16 q8, q12, #8 @ clip_sign[8..15] ++ vrshr.s16 q5, q5, #3 ++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] ++ vshr.s16 q7, q7, #1 @ clip[8..15] ++ vbsl q12, q10, q2 @ a3[8..15] ++ vabs.s16 q2, q3 @ a0[0..7] ++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 ++ vshr.s16 q3, q3, #8 @ a0_sign[0..7] ++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] ++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq ++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] ++ vabs.s16 q4, q5 @ a0[8..15] ++ vshr.s16 q5, q5, #8 @ a0_sign[8..15] ++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq ++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] ++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 ++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vmov.32 r2, d5[1] ++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] ++ vshl.i64 q2, q2, #16 ++ vcge.s16 q12, q15, q11 ++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 
5*(a0[8..15]-a3[8..15]) : 0 ++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vshr.s64 q2, q2, #48 ++ and r0, r0, r2 ++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) ++ vshl.i64 q11, q4, #16 ++ vmov.32 r2, d8[1] ++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q10, q2 ++ vmov.32 r12, d9[1] ++ vshr.s64 q4, q11, #48 ++ vcge.s16 q10, q0, q7 ++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vorr q4, q8, q4 ++ and r2, r2, r12 ++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) ++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and r0, r0, r2 ++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ tst r0, #1 ++ bne 1f @ none of the 16 pixel pairs should be updated in this case ++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ vqmovun.s16 d4, q14 ++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ vqmovun.s16 d0, q6 ++ vqmovun.s16 d5, q9 ++ vqmovun.s16 d1, q1 ++ vst1.64 {q2}, [r3 :128], r1 ++ vst1.64 {q0}, [r3 :128] ++1: vpop {d8-d15} ++ bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ push 
{r4-r6,lr} ++ vpush {d8-d15} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d3}, [r3], r1 ++ add r4, r0, r1, lsl #2 ++ vld1.32 {d10}, [r3], r1 ++ vld1.32 {d11}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d8}, [r3], r1 ++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... ++ vld1.32 {d14}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d12}, [r3], r1 ++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... ++ vld1.32 {d13}, [r3], r1 ++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vld1.32 {d1}, [r3], r1 ++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... ++ vld1.32 {d7}, [r3], r1 ++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vld1.32 {d9}, [r3], r1 ++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... ++ vld1.32 {d15}, [r3] ++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... ++ vdup.16 q9, r2 @ pq ++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... ++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] ++ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... ++ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... ++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] ++ vshll.u8 q10, d2, #1 @ 2*P1[0..7] ++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] ++ vshll.u8 q11, d16, #1 @ 2*P5[0..7] ++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] ++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... ++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... ++ vmovl.u8 q1, d3 @ P2[0..7] ++ vmovl.u8 q12, d4 @ P6[0..7] ++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] ++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] ++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] ++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] ++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vmovl.u8 q1, d10 @ P3[0..7] ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vshll.u8 q13, d1, #1 @ 2*P5[8..15] ++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q14, d6 @ P2[8..15] ++ vmovl.u8 q3, d7 @ P6[8..15] ++ vmovl.u8 q15, d8 @ P7[0..7] ++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q1, d12 @ P3[8..15] ++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vmovl.u8 q4, d9 @ P7[8..15] ++ vshll.u8 q14, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q5, d11 @ P4[0..7] ++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vshll.u8 q15, d12, #1 @ 2*P3[8..15] ++ vmovl.u8 q6, d13 @ P4[8..15] ++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q1, d14 @ P8[0..7] ++ vmovl.u8 q7, d15 @ P8[8..15] ++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q4, d16 @ P5[0..7] ++ vmovl.u8 q8, d1 @ P5[8..15] ++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] ++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ 
vrshr.s16 q10, q10, #3 ++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] ++ vrshr.s16 q11, q11, #3 ++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vabs.s16 q10, q10 @ a1[0..7] ++ vrshr.s16 q13, q13, #3 ++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vabs.s16 q3, q11 @ a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q11, q1 ++ vabs.s16 q12, q13 @ a2[8..15] ++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] ++ vshr.s16 q1, q1, #8 @ clip_sign[0..7] ++ vrshr.s16 q15, q15, #3 ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vrshr.s16 q14, q14, #3 ++ vbsl q13, q3, q10 @ a3[0..7] ++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] ++ vabs.s16 q10, q15 @ a0[8..15] ++ vshr.s16 q15, q15, #8 @ a0_sign[8..15] ++ vbsl q3, q12, q2 @ a3[8..15] ++ vabs.s16 q2, q14 @ a0[0..7] ++ vabs.s16 q12, q7 ++ vshr.s16 q7, q7, #8 @ clip_sign[8..15] ++ vshr.s16 q14, q14, #8 @ a0_sign[0..7] ++ vshr.s16 q12, q12, #1 @ clip[8..15] ++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] ++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] ++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq ++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq ++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] ++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] ++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 
5*(a0[8..15]-a3[8..15]) : 0 ++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 ++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 ++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vcge.s16 q14, q13, q12 ++ vmov.32 r2, d4[1] @ move to gp reg ++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vmov.32 r3, d5[1] ++ vcge.s16 q2, q0, q11 ++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) ++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) ++ vmov.32 r5, d6[1] ++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmov.32 r6, d7[1] ++ and r12, r2, r3 ++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ and r14, r5, r6 ++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ and r12, r12, r14 ++ vqmovun.s16 d4, q6 ++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ tst r12, #1 ++ bne 4f @ none of the 16 pixel pairs should be updated in this case ++ vqmovun.s16 d2, q5 ++ 
vqmovun.s16 d3, q4 ++ vqmovun.s16 d5, q8 ++ tst r2, #1 ++ bne 1f ++ vst2.8 {d2[0], d3[0]}, [r0], r1 ++ vst2.8 {d2[1], d3[1]}, [r0], r1 ++ vst2.8 {d2[2], d3[2]}, [r0], r1 ++ vst2.8 {d2[3], d3[3]}, [r0] ++1: add r0, r4, r1, lsl #2 ++ tst r3, #1 ++ bne 2f ++ vst2.8 {d2[4], d3[4]}, [r4], r1 ++ vst2.8 {d2[5], d3[5]}, [r4], r1 ++ vst2.8 {d2[6], d3[6]}, [r4], r1 ++ vst2.8 {d2[7], d3[7]}, [r4] ++2: add r4, r0, r1, lsl #2 ++ tst r5, #1 ++ bne 3f ++ vst2.8 {d4[0], d5[0]}, [r0], r1 ++ vst2.8 {d4[1], d5[1]}, [r0], r1 ++ vst2.8 {d4[2], d5[2]}, [r0], r1 ++ vst2.8 {d4[3], d5[3]}, [r0] ++3: tst r6, #1 ++ bne 4f ++ vst2.8 {d4[4], d5[4]}, [r4], r1 ++ vst2.8 {d4[5], d5[5]}, [r4], r1 ++ vst2.8 {d4[6], d5[6]}, [r4], r1 ++ vst2.8 {d4[7], d5[7]}, [r4] ++4: vpop {d8-d15} ++ pop {r4-r6,pc} ++endfunc ++ ++@ Copy at most the specified number of bytes from source to destination buffer, ++@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence ++@ On entry: ++@ r0 -> source buffer ++@ r1 = max number of bytes to copy ++@ r2 -> destination buffer, optimally 8-byte aligned ++@ On exit: ++@ r0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ @ Offset by 48 to screen out cases that are too short for us to handle, ++ @ and also make it easy to test for loop termination, or to determine ++ @ whether we need an odd number of half-iterations of the loop. ++ subs r1, r1, #48 ++ bmi 90f ++ ++ @ Set up useful constants ++ vmov.i32 q0, #0x3000000 ++ vmov.i32 q1, #0x30000 ++ ++ tst r1, #16 ++ bne 1f ++ ++ vld1.8 {q8, q9}, [r0]! ++ vbic q12, q8, q0 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ add r1, r1, #16 ++ b 3f ++ ++1: vld1.8 {q10, q11}, [r0]! 
++ vbic q12, q10, q0 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ @ Drop through... ++2: vmov q8, q11 ++ vld1.8 {q9}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q8, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 90f ++ vst1.64 {q10}, [r2]! ++3: vmov q10, q9 ++ vld1.8 {q11}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q10, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 91f ++ vst1.64 {q8}, [r2]! ++ subs r1, r1, #32 ++ bpl 2b ++ ++90: add r0, r1, #48 ++ bx lr ++ ++91: sub r1, r1, #16 ++ b 90b ++endfunc +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index 8a71c04230..53644506e5 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -2595,6 +2595,17 @@ typedef struct AVHWAccel { + * that avctx->hwaccel_priv_data is invalid. 
+ */ + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ ++ /** ++ * Called if parsing fails ++ * ++ * An error has occurred, end_frame will not be called ++ * start_frame & decode_slice may or may not have been called ++ * Optional ++ * ++ * @param avctx the codec context ++ */ ++ void (*abort_frame)(AVCodecContext *avctx); + } AVHWAccel; + + /** +diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h +index 38d06b2842..bbf5d70560 100644 +--- a/libavcodec/cabac.h ++++ b/libavcodec/cabac.h +@@ -44,6 +44,10 @@ typedef struct CABACContext{ + const uint8_t *bytestream_start; + const uint8_t *bytestream; + const uint8_t *bytestream_end; ++ struct { ++ uint16_t bits; ++ uint16_t range; ++ } by22; + }CABACContext; + + int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); +diff --git a/libavcodec/codec.h b/libavcodec/codec.h +index 50a22f6e3c..5acf572ef4 100644 +--- a/libavcodec/codec.h ++++ b/libavcodec/codec.h +@@ -367,6 +367,17 @@ const AVCodec *av_codec_iterate(void **opaque); + */ + AVCodec *avcodec_find_decoder(enum AVCodecID id); + ++/** ++ * Find a registered decoder with a matching codec ID and pix_fmt. ++ * A decoder with pix_fmt set to NULL will match any fmt. ++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL. ++ * ++ * @param id AVCodecID of the requested decoder ++ * @param fmt AVPixelFormat that must be supported by decoder ++ * @return A decoder if one was found, NULL otherwise. ++ */ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt); ++ + /** + * Find a registered decoder with the specified name. + * +diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h +new file mode 100644 +index 0000000000..72cbba0953 +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v1.h +@@ -0,0 +1,229 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers.
++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include <linux/videodev2.h> ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. */ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define 
V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; 
++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u8 num_active_dpb_entries; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 num_rps_poc_st_curr_before; ++ __u8 num_rps_poc_st_curr_after; ++ __u8 num_rps_poc_lt_curr; ++ ++ __u8 padding; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h +new file mode 100644 +index 0000000000..7cbbbf055f +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v2.h +@@ -0,0 +1,257 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include <linux/videodev2.h> ++ ++/* The pixel format isn't stable at the moment and will likely be renamed.
*/ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. 
*/ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 
padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). 
++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h +new file mode 100644 +index 0000000000..4e35bd583d +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v3.h +@@ -0,0 +1,255 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include <linux/videodev2.h> ++ ++/* The pixel format isn't stable at the moment and will likely be renamed.
*/ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. 
*/ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). 
++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++#endif +diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h +new file mode 100644 +index 0000000000..c02fdbe5a8 +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -0,0 +1,524 @@ ++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Video for Linux Two controls header file ++ * ++ * Copyright (C) 1999-2012 the contributors ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * Alternatively you can redistribute this file under the terms of the ++ * BSD license as stated below: ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * 3. The names of its contributors may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * The contents of this header was split off from videodev2.h. All control ++ * definitions should be added to this header, which is included by ++ * videodev2.h. ++ */ ++ ++#ifndef AVCODEC_HEVC_CTRLS_V4_H ++#define AVCODEC_HEVC_CTRLS_V4_H ++ ++#include <linux/const.h> ++#include <linux/types.h> ++ ++#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS ++#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */ ++#endif ++#ifndef V4L2_CID_CODEC_STATELESS_BASE ++#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900) ++#endif ++ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) ++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) ++#define V4L2_CID_STATELESS_HEVC_START_CODE
(V4L2_CID_CODEC_STATELESS_BASE + 406) ++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) ++ ++enum v4l2_stateless_hevc_decode_mode { ++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_stateless_hevc_start_code { ++ V4L2_STATELESS_HEVC_START_CODE_NONE, ++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/** ++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set ++ * ++ * @video_parameter_set_id: specifies the value of the ++ * vps_video_parameter_set_id of the active VPS ++ * @seq_parameter_set_id: provides an identifier for the SPS for ++ * reference by other syntax elements ++ * @pic_width_in_luma_samples: specifies the width of each decoded picture ++ * in units of luma samples ++ * @pic_height_in_luma_samples: specifies the height of each decoded picture ++ * in units of luma samples ++ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the luma array ++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the chroma arrays ++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of ++ * the variable MaxPicOrderCntLsb ++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum ++ * required size of the decoded picture ++ * buffer for the codec video sequence ++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures ++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the ++ * value of SpsMaxLatencyPictures array ++ * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the minimum ++ * luma coding block size ++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * coding block size ++ * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the minimum luma ++ * transform block size ++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * transform block size ++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in inter ++ * prediction mode ++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in intra ++ * prediction mode ++ *
@pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of ++ * bits used to represent each of PCM sample ++ * values of the luma component ++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number ++ * of bits used to represent each of PCM ++ * sample values of the chroma components ++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the ++ * minimum size of coding blocks ++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum size of ++ * coding blocks ++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() ++ * syntax structures included in the SPS ++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term ++ * reference pictures that are specified in the SPS ++ * @chroma_format_idc: specifies the chroma sampling ++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number ++ * of temporal sub-layers ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @flags: see V4L2_HEVC_SPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_sps { ++ __u8 video_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u8 reserved[6]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL 
<< 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++/** ++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set ++ * ++ * @pic_parameter_set_id: identifies the PPS for reference by other ++ * syntax elements ++ * @num_extra_slice_header_bits: specifies the number of extra slice header ++ * bits that are present in the slice header RBSP ++ * for coded pictures referring to the PPS. ++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l0_active_minus1 ++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l1_active_minus1 ++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for ++ * each slice referring to the PPS ++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding ++ * tree block size and the minimum luma coding block ++ * size of coding units that convey cu_qp_delta_abs ++ * and cu_qp_delta_sign_flag ++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb ++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr ++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns ++ * partitioning the picture ++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning ++ * the picture ++ * @column_width_minus1: this value plus 1 specifies the width of each tile column in ++ * units of coding tree blocks ++ *
@row_height_minus1: this value plus 1 specifies the height of each tile row in ++ * units of coding tree blocks ++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for ++ * beta divided by 2 ++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC ++ * divided by 2 ++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of ++ * the variable Log2ParMrgLevel ++ * @reserved: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_PPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_pps { ++ __u8 pic_parameter_set_id; ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ __u8 reserved; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++/** ++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry ++ * ++
* @timestamp: timestamp of the V4L2 capture buffer to use as reference. ++ * @flags: long term flag for the reference frame ++ * @field_pic: whether the reference is a field picture or a frame. ++ * @reserved: padding field. Should be zeroed by applications. ++ * @pic_order_cnt_val: the picture order count of the current picture. ++ */ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 reserved; ++ __s32 pic_order_cnt_val; ++}; ++ ++/** ++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters ++ * ++ * @delta_luma_weight_l0: the difference of the weighting factor applied ++ * to the luma prediction value for list 0 ++ * @luma_offset_l0: the additive offset applied to the luma prediction value ++ * for list 0 ++ * @delta_chroma_weight_l0: the difference of the weighting factor applied ++ * to the chroma prediction values for list 0 ++ * @chroma_offset_l0: the difference of the additive offset applied to ++ * the chroma prediction values for list 0 ++ * @delta_luma_weight_l1: the difference of the weighting factor applied ++ * to the luma prediction value for list 1 ++ * @luma_offset_l1: the additive offset applied to the luma prediction value ++ * for list 1 ++ * @delta_chroma_weight_l1: the difference of the weighting factor applied ++ * to the chroma prediction values for list 1 ++ * @chroma_offset_l1: the difference of the additive offset applied to ++ * the chroma prediction values for list 1 ++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for ++ * all luma weighting factors ++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm ++ * of the denominator for all chroma ++ * weighting factors ++ */ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 
chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++/** ++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters ++ * ++ * This control is a dynamically sized 1-dimensional array, ++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. ++ * ++ * @bit_size: size (in bits) of the current slice data ++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data ++ * @num_entry_point_offsets: specifies the number of entry point offset syntax ++ * elements in the slice header. 
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I) ++ * @nuh_temporal_id_plus1: this value minus 1 specifies a temporal identifier for the NAL unit ++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} ++ * @colour_plane_id: specifies the colour plane associated with the current slice ++ * @slice_pic_order_cnt: specifies the picture order count ++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 0 ++ * that may be used to decode the slice ++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 1 ++ * that may be used to decode the slice ++ * @collocated_ref_idx: specifies the reference index of the collocated picture used ++ * for temporal motion vector prediction ++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging ++ * motion vector prediction candidates supported in ++ * the slice subtracted from 5 ++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding ++ * blocks in the slice ++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset ++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset ++ * @slice_act_y_qp_offset: screen content extension parameters ++ * @slice_act_cb_qp_offset: screen content extension parameters ++ * @slice_act_cr_qp_offset: screen content extension parameters ++ * @slice_beta_offset_div2: specifies the deblocking parameter offsets for beta divided by 2 ++ * @slice_tc_offset_div2: specifies the deblocking parameter offsets for tC divided by 2 ++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or ++ * more fields ++ * @reserved0: padding field. Should be zeroed by applications.
++ * @slice_segment_addr: specifies the address of the first coding tree block in ++ * the slice segment ++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB ++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS ++ * @pred_weight_table: the prediction weight coefficients for inter-picture ++ * prediction ++ * @reserved1: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_byte_offset; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __s32 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ __u8 reserved0[3]; ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u8 reserved1[2]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++/** ++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters ++ * ++ * @pic_order_cnt_val: picture order count ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS of the first slice ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS of the first slice ++ * @num_active_dpb_entries: the number of entries in dpb ++ * @num_poc_st_curr_before: the number of reference pictures in the short-term ++ * set that come before the current frame ++ * @num_poc_st_curr_after: the number of reference pictures in the short-term ++ * set that come after the current frame ++ * @num_poc_lt_curr: the number of reference pictures in the long-term set ++ * @poc_st_curr_before: provides the index of the short term before references ++ * in DPB array ++ * @poc_st_curr_after: provides the index of the short term after references ++ * in DPB array ++ * @poc_lt_curr: provides the index of the long term references in DPB array ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @dpb: the decoded picture buffer, for meta-data about reference frames ++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ __u8 num_active_dpb_entries; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 reserved[4]; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/** ++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters ++ * ++ * @scaling_list_4x4: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_8x8: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_16x16: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_32x32: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. ++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. 
++ */ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c +index 463d352055..7feff43c28 100644 +--- a/libavcodec/hevc_parser.c ++++ b/libavcodec/hevc_parser.c +@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, + avctx->profile = ps->sps->ptl.general_ptl.profile_idc; + avctx->level = ps->sps->ptl.general_ptl.level_idc; + ++ if (ps->sps->chroma_format_idc == 1) { ++ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? ++ ps->sps->vui.chroma_sample_loc_type_top_field + 1 : ++ AVCHROMA_LOC_LEFT; ++ } ++ else if (ps->sps->chroma_format_idc == 2 || ++ ps->sps->chroma_format_idc == 3) { ++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; ++ } ++ else { ++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; ++ } ++ + if (ps->vps->vps_timing_info_present_flag) { + num = ps->vps->vps_num_units_in_tick; + den = ps->vps->vps_time_scale; +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index 4f6d985ae6..eefae71275 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) + if (!frame->rpl_buf) + goto fail; + +- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); +- if (!frame->tab_mvf_buf) +- goto fail; +- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ if (s->tab_mvf_pool) { ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ } + +- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); +- if (!frame->rpl_tab_buf) +- goto fail; +- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; +-
frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; +- for (j = 0; j < frame->ctb_count; j++) +- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ if (s->rpl_tab_pool) { ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); +@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s) + int ctb_count = frame->ctb_count; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + int i; ++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + + if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) + return AVERROR_INVALIDDATA; + +- for (i = ctb_addr_ts; i < ctb_count; i++) +- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; ++ if (frame->rpl_tab) { ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = tab; ++ } + +- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; ++ frame->refPicList = tab->refPicList; + + return 0; + } +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index 2231aed259..7b05b41441 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) + + ff_set_sar(avctx, sps->vui.sar); + ++ if (sps->chroma_format_idc == 1) { ++ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? 
++ sps->vui.chroma_sample_loc_type_top_field + 1 : ++ AVCHROMA_LOC_LEFT; ++ } ++ else if (sps->chroma_format_idc == 2 || ++ sps->chroma_format_idc == 3) { ++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; ++ } ++ else { ++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; ++ } ++ + if (sps->vui.video_signal_type_present_flag) + avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG + : AVCOL_RANGE_MPEG; +@@ -392,14 +405,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ ++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ ++ CONFIG_HEVC_RPI4_8_HWACCEL + \ ++ CONFIG_HEVC_RPI4_10_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; + + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_8; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -418,9 +437,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_10; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -439,6 +464,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV444P: +@@ -485,6 +513,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, + if (!sps) +
return 0; + ++ // If hwaccel then we don't need all the s/w decode helper arrays ++ if (s->avctx->hwaccel) { ++ export_stream_params(s, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ return 0; ++ } ++ + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; +@@ -2901,11 +2939,13 @@ static int hevc_frame_start(HEVCContext *s) + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + +- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); +- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); +- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); +- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); +- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ if (s->horizontal_bs) { ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; +@@ -3327,7 +3367,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, + s->ref = NULL; + ret = decode_nal_units(s, avpkt->data, avpkt->size); + if (ret < 0) ++ { ++ // Ensure that hwaccel knows this frame is over ++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { ++ s->avctx->hwaccel->abort_frame(s->avctx); ++ } ++ + return ret; ++ } + + if (avctx->hwaccel) { + if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { +@@ -3370,15 +3417,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) + if (ret < 0) + return ret; + +- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); +- if 
(!dst->tab_mvf_buf) +- goto fail; +- dst->tab_mvf = src->tab_mvf; ++ if (src->tab_mvf_buf) { ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ } + +- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); +- if (!dst->rpl_tab_buf) +- goto fail; +- dst->rpl_tab = src->rpl_tab; ++ if (src->rpl_tab_buf) { ++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ } + + dst->rpl_buf = av_buffer_ref(src->rpl_buf); + if (!dst->rpl_buf) +@@ -3697,6 +3748,15 @@ AVCodec ff_hevc_decoder = { + #if CONFIG_HEVC_NVDEC_HWACCEL + HWACCEL_NVDEC(hevc), + #endif ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ HWACCEL_RPI4_8(hevc), ++#endif ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ HWACCEL_RPI4_10(hevc), ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(hevc), ++#endif + #if CONFIG_HEVC_VAAPI_HWACCEL + HWACCEL_VAAPI(hevc), + #endif +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 8e54cf73f9..2277aadf75 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -39,6 +39,9 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; + extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; + extern const AVHWAccel ff_hevc_dxva2_hwaccel; + extern const AVHWAccel ff_hevc_nvdec_hwaccel; ++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel; ++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel; ++extern const AVHWAccel ff_hevc_v4l2request_hwaccel; + extern const AVHWAccel ff_hevc_vaapi_hwaccel; + extern const AVHWAccel ff_hevc_vdpau_hwaccel; + extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; +diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h +index f421dc909f..f93283b893 100644 +--- a/libavcodec/hwconfig.h ++++ b/libavcodec/hwconfig.h +@@ -24,6 +24,7 @@ + + + #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) ++#define HWACCEL_CAP_MT_SAFE (1 << 1) + + + typedef struct AVCodecHWConfigInternal { +@@ -70,6 +71,12 @@ typedef struct 
AVCodecHWConfigInternal { + HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) + #define HWACCEL_NVDEC(codec) \ + HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) ++#define HWACCEL_RPI4_8(codec) \ ++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel) ++#define HWACCEL_RPI4_10(codec) \ ++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel) ++#define HWACCEL_V4L2REQUEST(codec) \ ++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) + #define HWACCEL_VAAPI(codec) \ + HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) + #define HWACCEL_VDPAU(codec) \ +diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +index cb15ac072a..f6261db962 100644 +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c +@@ -24,6 +24,9 @@ + * MMAL Video Decoder + */ + ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" + #include + #include + #include +@@ -31,6 +34,7 @@ + #include + #include + #include ++#pragma GCC diagnostic pop + #include + + #include "avcodec.h" +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 9176027f15..0b0ff03c18 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -209,7 +209,8 @@ FF_ENABLE_DEPRECATION_WARNINGS + + /* if the previous thread uses hwaccel then we take the lock to ensure + * the threads don't run concurrently */ +- if (avctx->hwaccel) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +@@ -636,7 +637,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { + + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; + +- if (avctx->hwaccel && !p->hwaccel_serializing) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && 
++ !p->hwaccel_serializing) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index 079d5c5d10..0781f28615 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -294,6 +294,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, ++ + { AV_PIX_FMT_NONE, 0 }, + }; + +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74570..b943dd0379 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -24,6 +24,7 @@ + * Raw Video Encoder + */ + ++#include "config.h" + #include "avcodec.h" + #include "raw.h" + #include "internal.h" +@@ -31,6 +32,10 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#if CONFIG_SAND ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++#if CONFIG_SAND ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, 
width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); ++ return 0; ++} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ ++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height * 2; ++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); ++ return 0; ++} ++#endif ++ ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- const AVFrame *frame, int 
*got_packet) ++ const AVFrame *src_frame, int *got_packet) + { +- int ret = av_image_get_buffer_size(frame->format, +- frame->width, frame->height, 1); ++ int ret; ++ AVFrame * frame = NULL; + +- if (ret < 0) ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(src_frame)) { ++ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : ++ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : ++ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : AVERROR(EINVAL); /* sand frame matching none of the 8/16/30 checks */ ++ *got_packet = (ret == 0); + return ret; ++ } ++#endif ++ ++ if ((frame = av_frame_clone(src_frame)) == NULL) { ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) ++ goto fail; ++ ++ ret = av_image_get_buffer_size(frame->format, ++ frame->width, frame->height, 1); ++ if (ret < 0) ++ goto fail; + + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) +- return ret; ++ goto fail; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, + (const uint8_t **)frame->data, frame->linesize, + frame->format, + frame->width, frame->height, 1)) < 0) +- return ret; ++ goto fail; + + if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && + frame->format == AV_PIX_FMT_YUYV422) { +@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + } + } + pkt->flags |= AV_PKT_FLAG_KEY; ++ av_frame_free(&frame); + *got_packet = 1; + return 0; ++ ++fail: ++ av_frame_free(&frame); ++ *got_packet = 0; ++ return ret; + } + + AVCodec ff_rawvideo_encoder = { +diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c +new file mode 100644 +index 0000000000..58c094c5f8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac.c +@@ -0,0 +1,2257 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter
de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#define UNCHECKED_BITSTREAM_READER 1 ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++ ++#include "cabac_functions.h" ++#include "rpi_hevc_data.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++#include "rpi_hevc_cabac_fns.h" ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++// BY22 is probably faster than simple bypass if the processor has ++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction ++// x86 has fast int divide ++// Arm doesn't have divide or general fast 64 bit, but does have the multiply ++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used ++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) ++// Use native divide if we have a fast one - otherwise use mpy 1/x ++// x86 has a fast integer divide - arm doesn't - unsure about other ++// architectures ++#define USE_BY22_DIV ARCH_X86 ++ ++// Special case blocks with a single significant coeff ++// Decreases the complexity of the code for a common case but increases the ++// code size.
++#define USE_N_END_1 1 ++ ++#if !USE_BY22_DIV ++// * 1/x @ 32 bits gets us 22 bits of accuracy ++#define CABAC_BY22_PEEK_BITS 22 ++#else ++// A real 32-bit divide gets us another bit ++// If we have a 64 bit int & a unit time divider then we should get a lot ++// of bits (55) but that is untested and it is unclear if it would give ++// us a large advantage ++#define CABAC_BY22_PEEK_BITS 23 ++#endif ++ ++#define CABAC_MAX_BIN 31 ++ ++ ++#if USE_BY22 && !USE_BY22_DIV ++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) ++ ++static const uint32_t cabac_by22_inv_range[256] = { ++ 0, I(257), I(258), I(259), ++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), ++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), ++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), ++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), ++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), ++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), ++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), ++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), ++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), ++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), ++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), ++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), ++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), ++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), ++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), ++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), ++ I(420), I(421), I(422), I(423), I(424), 
I(425), I(426), I(427), I(428), I(429), ++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), ++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), ++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), ++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), ++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), ++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), ++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), ++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), ++ I(510), I(511) ++}; ++#undef I ++#endif // USE_BY22 ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_cabac.h" ++#endif ++ ++/** ++ * number of bin by SyntaxElement. ++ */ ++static const int8_t num_bins_in_se[] = { ++ 1, // sao_merge_flag ++ 1, // sao_type_idx ++ 0, // sao_eo_class ++ 0, // sao_band_position ++ 0, // sao_offset_abs ++ 0, // sao_offset_sign ++ 0, // end_of_slice_flag ++ 3, // split_coding_unit_flag ++ 1, // cu_transquant_bypass_flag ++ 3, // skip_flag ++ 3, // cu_qp_delta ++ 1, // pred_mode ++ 4, // part_mode ++ 0, // pcm_flag ++ 1, // prev_intra_luma_pred_mode ++ 0, // mpm_idx ++ 0, // rem_intra_luma_pred_mode ++ 2, // intra_chroma_pred_mode ++ 1, // merge_flag ++ 1, // merge_idx ++ 5, // inter_pred_idc ++ 2, // ref_idx_l0 ++ 2, // ref_idx_l1 ++ 2, // abs_mvd_greater0_flag ++ 2, // abs_mvd_greater1_flag ++ 0, // abs_mvd_minus2 ++ 0, // mvd_sign_flag ++ 1, // mvp_lx_flag ++ 1, // no_residual_data_flag ++ 3, // split_transform_flag ++ 2, // cbf_luma ++ 4, // cbf_cb, cbf_cr ++ 2, // transform_skip_flag[][] ++ 2, // explicit_rdpcm_flag[][] ++ 2, // explicit_rdpcm_dir_flag[][] ++ 18, // last_significant_coeff_x_prefix ++ 18, // last_significant_coeff_y_prefix ++ 0, // last_significant_coeff_x_suffix ++ 0, // last_significant_coeff_y_suffix ++ 4, // 
significant_coeff_group_flag ++ 44, // significant_coeff_flag ++ 24, // coeff_abs_level_greater1_flag ++ 6, // coeff_abs_level_greater2_flag ++ 0, // coeff_abs_level_remaining ++ 0, // coeff_sign_flag ++ 8, // log2_res_scale_abs ++ 2, // res_scale_sign_flag ++ 1, // cu_chroma_qp_offset_flag ++ 1, // cu_chroma_qp_offset_idx ++}; ++ ++/** ++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. ++ */ ++static const int elem_offset[sizeof(num_bins_in_se)] = { ++ 0, // sao_merge_flag ++ 1, // sao_type_idx ++ 2, // sao_eo_class ++ 2, // sao_band_position ++ 2, // sao_offset_abs ++ 2, // sao_offset_sign ++ 2, // end_of_slice_flag ++ 2, // split_coding_unit_flag ++ 5, // cu_transquant_bypass_flag ++ 6, // skip_flag ++ 9, // cu_qp_delta ++ 12, // pred_mode ++ 13, // part_mode ++ 17, // pcm_flag ++ 17, // prev_intra_luma_pred_mode ++ 18, // mpm_idx ++ 18, // rem_intra_luma_pred_mode ++ 18, // intra_chroma_pred_mode ++ 20, // merge_flag ++ 21, // merge_idx ++ 22, // inter_pred_idc ++ 27, // ref_idx_l0 ++ 29, // ref_idx_l1 ++ 31, // abs_mvd_greater0_flag ++ 33, // abs_mvd_greater1_flag ++ 35, // abs_mvd_minus2 ++ 35, // mvd_sign_flag ++ 35, // mvp_lx_flag ++ 36, // no_residual_data_flag ++ 37, // split_transform_flag ++ 40, // cbf_luma ++ 42, // cbf_cb, cbf_cr ++ 46, // transform_skip_flag[][] ++ 48, // explicit_rdpcm_flag[][] ++ 50, // explicit_rdpcm_dir_flag[][] ++ 52, // last_significant_coeff_x_prefix ++ 70, // last_significant_coeff_y_prefix ++ 88, // last_significant_coeff_x_suffix ++ 88, // last_significant_coeff_y_suffix ++ 88, // significant_coeff_group_flag ++ 92, // significant_coeff_flag ++ 136, // coeff_abs_level_greater1_flag ++ 160, // coeff_abs_level_greater2_flag ++ 166, // coeff_abs_level_remaining ++ 166, // coeff_sign_flag ++ 166, // log2_res_scale_abs ++ 174, // res_scale_sign_flag ++ 176, // cu_chroma_qp_offset_flag ++ 177, // cu_chroma_qp_offset_idx ++}; ++ ++#define CNU 154 ++/** ++ * Indexed by init_type ++ */ ++static const 
uint8_t init_values[3][HEVC_CONTEXTS] = { ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 200, ++ // split_coding_unit_flag ++ 139, 141, 157, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ CNU, CNU, CNU, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ CNU, ++ // part_mode ++ 184, CNU, CNU, CNU, ++ // prev_intra_luma_pred_mode ++ 184, ++ // intra_chroma_pred_mode ++ 63, 139, ++ // merge_flag ++ CNU, ++ // merge_idx ++ CNU, ++ // inter_pred_idc ++ CNU, CNU, CNU, CNU, CNU, ++ // ref_idx_l0 ++ CNU, CNU, ++ // ref_idx_l1 ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // mvp_lx_flag ++ CNU, ++ // no_residual_data_flag ++ CNU, ++ // split_transform_flag ++ 153, 138, 138, ++ // cbf_luma ++ 111, 141, ++ // cbf_cb, cbf_cr ++ 94, 138, 182, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // last_significant_coeff_y_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // significant_coeff_group_flag ++ 91, 171, 134, 141, ++ // significant_coeff_flag ++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, ++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, ++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, ++ 141, 111, ++ // coeff_abs_level_greater1_flag ++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, ++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, ++ // coeff_abs_level_greater2_flag ++ 138, 153, 136, 167, 152, 152, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 185, ++ 
// split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 149, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 154, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 110, ++ // merge_idx ++ 122, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 124, 138, 94, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 107, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // last_significant_coeff_y_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 122, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 160, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 
201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 134, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 183, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 154, ++ // merge_idx ++ 137, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 224, 167, 122, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 92, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // last_significant_coeff_y_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 107, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++}; ++ ++static const uint8_t scan_1x1[1] = { ++ 0, ++}; ++ ++static const uint8_t horiz_scan2x2_x[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t horiz_scan2x2_y[4] = { ++ 0, 0, 1, 1 ++}; ++ ++static const uint8_t horiz_scan4x4_x[16] = { ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 
2, 3, ++ 0, 1, 2, 3, ++}; ++ ++static const uint8_t horiz_scan4x4_y[16] = { ++ 0, 0, 0, 0, ++ 1, 1, 1, 1, ++ 2, 2, 2, 2, ++ 3, 3, 3, 3, ++}; ++ ++static const uint8_t horiz_scan8x8_inv[8][8] = { ++ { 0, 1, 2, 3, 16, 17, 18, 19, }, ++ { 4, 5, 6, 7, 20, 21, 22, 23, }, ++ { 8, 9, 10, 11, 24, 25, 26, 27, }, ++ { 12, 13, 14, 15, 28, 29, 30, 31, }, ++ { 32, 33, 34, 35, 48, 49, 50, 51, }, ++ { 36, 37, 38, 39, 52, 53, 54, 55, }, ++ { 40, 41, 42, 43, 56, 57, 58, 59, }, ++ { 44, 45, 46, 47, 60, 61, 62, 63, }, ++}; ++ ++static const uint8_t diag_scan2x2_x[4] = { ++ 0, 0, 1, 1, ++}; ++ ++static const uint8_t diag_scan2x2_y[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t diag_scan2x2_inv[2][2] = { ++ { 0, 2, }, ++ { 1, 3, }, ++}; ++ ++static const uint8_t diag_scan4x4_inv[4][4] = { ++ { 0, 2, 5, 9, }, ++ { 1, 4, 8, 12, }, ++ { 3, 7, 11, 14, }, ++ { 6, 10, 13, 15, }, ++}; ++ ++static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 0, 2, 5, 9, 14, 20, 27, 35, }, ++ { 1, 4, 8, 13, 19, 26, 34, 42, }, ++ { 3, 7, 12, 18, 25, 33, 41, 48, }, ++ { 6, 11, 17, 24, 32, 40, 47, 53, }, ++ { 10, 16, 23, 31, 39, 46, 52, 57, }, ++ { 15, 22, 30, 38, 45, 51, 56, 60, }, ++ { 21, 29, 37, 44, 50, 55, 59, 62, }, ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++}; ++ ++ ++typedef struct ++{ ++ uint16_t coeff; ++ uint16_t scale; ++} xy_off_t; ++ ++#define XYT_C(x,y,t) ((x) + ((y) << (t))) ++#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) ++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) ++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) ++ ++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} ++ ++#define OFF_DIAG(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ ++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ ++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ ++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++#define OFF_HORIZ(t) {\ ++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ ++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ ++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ ++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ ++} ++ ++#define OFF_VERT(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ ++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ ++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ ++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++static const xy_off_t off_xys[3][4][16] = ++{ ++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, ++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, ++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} ++}; ++ ++ ++// Helper fns ++#ifndef hevc_mem_bits32 ++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) ++{ ++ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); ++} ++#endif ++ ++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) ++#define hevc_clz32 hevc_clz32_builtin ++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) ++{ ++ // __builtin_clz says it works on ints - so adjust if int is >32 bits long ++ return __builtin_clz(x) - (sizeof(int) * 8 - 32); ++} ++#endif ++ ++// It is unlikely that we will ever need this but include for completeness ++#ifndef hevc_clz32 ++static inline unsigned int hevc_clz32(unsigned int x) ++{ ++ unsigned int n = 1; ++ if ((x & 0xffff0000) == 0) { ++ n += 16; ++ x <<= 16; ++ } ++ if ((x & 0xff000000) == 0) { ++ n += 8; ++ 
x <<= 8; ++ } ++ if ((x & 0xf0000000) == 0) { ++ n += 4; ++ x <<= 4; ++ } ++ if ((x & 0xc0000000) == 0) { ++ n += 2; ++ x <<= 2; ++ } ++ return n - ((x >> 31) & 1); ++} ++#endif ++ ++static inline int cabac_overflow(const CABACContext * const cc) ++{ ++ av_assert0(cc->bytestream >= cc->bytestream_start); ++ return cc->bytestream >= cc->bytestream_end + 4; ++} ++ ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) ++{ ++ return cabac_overflow(&lc->cc); ++} ++ ++#if !USE_BY22 ++// If no by22 then _by22 functions will revert to normal and so _peek/_flush ++// will no longer be called but the setup calls will still exist and we want ++// to null them out ++#define bypass_start(s) ++#define bypass_finish(s) ++#else ++// Use BY22 for residual bypass block ++ ++#define bypass_start(cc) get_cabac_by22_start(cc) ++#define bypass_finish(cc) get_cabac_by22_finish(cc) ++ ++// BY22 notes that bypass is simply a divide into the bitstream and so we ++// can peek out large quantities of bits at once and treat the result as if ++// it was VLC. In many cases this will lead to O(1) processing rather than ++// O(n) though the setup and teardown is sufficiently expensive that it is ++// only worth using if we expect to be dealing with more than a few bits ++// The definition of "a few bits" will vary from platform to platform but ++// tests on ARM show that it probably isn't worth it for a single coded ++// residual, but is for >1 - it also seems likely that if there are ++// more residuals then they are likely to be bigger and this will make the ++// O(1) nature of the code more worthwhile. ++ ++ ++// Bypass block start ++// Must be called before _by22_peek is used as it sets the CABAC environment ++// into the correct state. _by22_finish must be called to return to 'normal' ++// (i.e. 
non-bypass) cabac decoding ++#ifndef get_cabac_by22_start ++static inline void get_cabac_by22_start(CABACContext * const c) ++{ ++ const unsigned int bits = __builtin_ctz(c->low); ++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); ++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); ++#if !USE_BY22_DIV ++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; ++#endif ++ ++ c->bytestream -= (CABAC_BITS / 8); ++ c->by22.bits = bits; ++#if !USE_BY22_DIV ++ c->by22.range = c->range; ++ c->range = inv; ++#endif ++ c->low = x; ++} ++#endif ++ ++// Bypass block finish ++// Must be called at the end of the bypass block to return to normal operation ++static inline void get_cabac_by22_finish(CABACContext * const c) ++{ ++ unsigned int used = c->by22.bits; ++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); ++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); ++ ++ c->bytestream += bytes_used + (CABAC_BITS / 8); ++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; ++#if !USE_BY22_DIV ++ c->range = c->by22.range; ++#endif ++} ++ ++// Peek bypass bits ++// _by22_start must be called before _by22_peek is called and _by22_flush ++// must be called afterwards to flush any used bits ++// The actual number of valid bits returned is ++// min(coded bypass block length, CABAC_BY22_PEEK_BITS). 
CABAC_BY22_PEEK_BITS ++// will be at least 22 which should be long enough for any prefix or suffix ++// though probably not long enough for the worst case combination ++#ifndef get_cabac_by22_peek ++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) ++{ ++#if USE_BY22_DIV ++ return ((unsigned int)c->low / (unsigned int)c->range) << 9; ++#else ++ uint32_t x = c->low & ~1U; ++ const uint32_t inv = c->range; ++ ++ if (inv != 0) ++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); ++ ++ return x << 1; ++#endif ++} ++#endif ++ ++// Flush bypass bits peeked by _by22_peek ++// Flush n bypass bits. n must be >= 1 to guarantee correct operation ++// val is an unmodified copy of whatever _by22_peek returned ++#ifndef get_cabac_by22_flush ++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) ++{ ++ // Subtract the bits used & reshift up to the top of the word ++#if USE_BY22_DIV ++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); ++#else ++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); ++#endif ++ ++ // and refill lower bits ++ // We will probably OR over some existing bits but that doesn't matter ++ c->by22.bits += n; ++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); ++} ++#endif ++ ++#endif // USE_BY22 ++ ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) ++{ ++ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); ++ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); ++} ++ ++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); ++ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); ++} ++ ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) ++{ ++ GetBitContext * const gb = &lc->gb; ++ skip_bits(gb, 1); ++ 
align_get_bits(gb); ++ return ff_init_cabac_decoder(&lc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int init_type = 2 - s->sh.slice_type; ++ int i; ++ ++ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ++ init_type ^= 3; ++ ++ for (i = 0; i < HEVC_CONTEXTS; i++) { ++ int init_value = init_values[init_type][i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); ++ lc->cabac_state[i] = pre; ++ } ++ ++ for (i = 0; i < 4; i++) ++ lc->stat_coeff[i] = 0; ++} ++ ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) ++{ ++ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ cabac_init_state(s, lc); ++ } ++ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ load_states(s, lc); ++ } ++ lc->cabac_init_req = 0; ++} ++ ++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) ++{ ++ return get_cabac_inline(c, state); ++} ++ ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) ++{ ++ return get_cabac_terminate(c); ++} ++ ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) ++ return 0; ++ ++ if (!get_cabac_bypass(&lc->cc)) ++ return SAO_BAND; ++ return SAO_EDGE; ++} ++ ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int 
ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; ++ ++ while (i < length && get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) ++{ ++ return get_cabac_bypass(&lc->cc); ++} ++ ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) ++{ ++ int val = 1; ++ ++ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) ++ return 0; ++ ++ while (val < 5 && ++ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) ++ val++; ++ ++ if (val >= 5) { ++ unsigned int k = 0; ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ val += 1 << k; ++ k++; ++ } ++// if (k == CABAC_MAX_BIN) ++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ ++ while (k--) ++ val += get_cabac_bypass(&lc->cc) << k; ++ } ++ return get_cabac_bypass(&lc->cc) ? 
-val : val; ++} ++ ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int c_max = FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1); // TR cMax is the signalled list length (at most 5), not a fixed 5 ++ int i = 0; ++ ++ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) ++ i++; ++ ++ return i; ++} ++ ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) ++{ ++ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 ++ return PART_2Nx2N; ++ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ if (lc->cu.pred_mode == MODE_INTRA) // 0 ++ return PART_NxN; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ if (log2_cb_size == 3) // 00 ++ return PART_Nx2N; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 ++ return PART_Nx2N; ++ return PART_NxN; // 000 ++ } ++ ++ if (!s->ps.sps->amp_enabled_flag) { ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ return PART_Nx2N; ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 ++ return PART_2NxN; ++ if (get_cabac_bypass(&lc->cc)) // 0101 ++ return PART_2NxnD; ++ return PART_2NxnU; // 0100 ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 ++ return PART_Nx2N; ++ if (get_cabac_bypass(&lc->cc)) // 0001 ++ return PART_nRx2N; ++ return PART_nLx2N; // 0000 ++} ++ ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ while (i < 2 && get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret; ++ if 
(!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) ++ return 4; ++ ++ ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); ++ ++ if (i != 0) { ++ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ return i; ++} ++ ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) ++{ ++ if (nPbW + nPbH == 12) ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++ if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) ++ return PRED_BI; ++ ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++} ++ ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) ++{ ++ int i = 0; ++ int max = num_ref_idx_lx - 1; ++ int max_ctx = FFMIN(max, 2); ++ ++ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) ++ i++; ++ if (i == 2) { ++ while (i < max && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ ++ return i; ++} ++ ++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); ++} ++ ++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); ++} ++ ++#if !USE_BY22 ++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = 2; ++ int k = 1; ++ ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ ret += 1U << k; ++ k++; ++ } ++ if (k == CABAC_MAX_BIN) { ++ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ return 0; ++ } ++ ++ while (k--) ++ ret += get_cabac_bypass(&lc->cc) << k; ++ return get_cabac_bypass_sign(&lc->cc, -ret); ++} ++#endif ++ ++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ 
return get_cabac_bypass_sign(&lc->cc, -1); ++} ++ ++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); ++} ++ ++ ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { ++ int i =0; ++ ++ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) ++ i++; ++ ++ return i; ++} ++ ++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++{ ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ ++ if (!c_idx_nz) { ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++ ctx_offset = 15; ++ ctx_shift = log2_size - 2; ++ } ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scx_prefix = i; ++ ++ i = 0; ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scy_prefix = i; ++} ++ ++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, ++ int last_significant_coeff_prefix) ++{ ++ int i; ++ int length = (last_significant_coeff_prefix >> 1) - 1; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 1; i < length; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * 
const lc, int c_idx_nz, int ctx_cg) ++{ ++ int inc; ++ ++ inc = (ctx_cg != 0) + (c_idx_nz << 1); ++ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); ++} ++ ++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) ++{ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++} ++ ++#if !USE_BY22 ++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) ++#endif ++ ++ ++#ifndef coeff_abs_level_remaining_decode_bypass ++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) ++{ ++ uint32_t y; ++ unsigned int prefix; ++ unsigned int last_coeff_abs_level_remaining; ++ unsigned int n; ++ ++ y = get_cabac_by22_peek(c); ++ prefix = hevc_clz32(~y); ++ // y << prefix will always have top bit 0 ++ ++ if (prefix < 3) { ++ const unsigned int suffix = (y << prefix) >> (31 - rice_param); ++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; ++ n = prefix + 1 + rice_param; ++ } ++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) ++ { ++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); ++ ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix * 2 + rice_param - 2; ++ } ++ else { ++ unsigned int suffix; ++ ++ get_cabac_by22_flush(c, prefix, y); ++ y = get_cabac_by22_peek(c); ++ ++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix + rice_param - 2; ++ } ++ ++ get_cabac_by22_flush(c, n, y); ++ ++ return last_coeff_abs_level_remaining; ++} ++#endif ++ ++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) ++{ ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; ++ ++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { ++// 
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ return 0; ++ } ++ ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } ++ ++ return last_coeff_abs_level_remaining; ++} ++ ++#if !USE_BY22 ++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode ++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) ++{ ++ unsigned int i; ++ uint32_t ret = 0; ++ ++ for (i = 0; i < nb; i++) ++ ret = (ret << 1) | get_cabac_bypass(c); ++ ++ return ret << (32 - nb); ++} ++#endif ++ ++#ifndef coeff_sign_flag_decode_bypass ++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) ++{ ++ uint32_t y; ++ y = get_cabac_by22_peek(c); ++ get_cabac_by22_flush(c, nb, y); ++ return y & ~(0xffffffffU >> nb); ++} ++#endif ++ ++ ++#ifndef get_cabac_greater1_bits ++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, ++ uint8_t * const state0) ++{ ++ unsigned int i; ++ unsigned int rv = 0; ++ for (i = 0; i != n; ++i) { ++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; ++ const unsigned int b = get_cabac(c, state0 + idx); ++ rv = (rv << 1) | b; ++ } ++ return rv; ++} ++#endif ++ ++ ++// N.B. levels returned are the values assuming coeff_abs_level_remaining ++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects ++// this version of events. 
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, ++ int * const pprev_subset_coded, int * const psum, ++ const unsigned int idx0_gt1, const unsigned int idx_gt2) ++{ ++ CABACContext * const c = &lc->cc; ++ uint8_t * const state0 = lc->cabac_state + idx0_gt1; ++ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; ++ unsigned int rv; ++ unsigned int i; ++ const unsigned int n = FFMIN(n_end, 8); ++ ++ // Really this is i != n but the simple unconditional loop is cheaper ++ // and faster ++ for (i = 0; i != 8; ++i) ++ levels[i] = 1; ++ ++ rv = get_cabac_greater1_bits(c, n, state0); ++ ++ *pprev_subset_coded = 0; ++ *psum = n; ++ ++ rv <<= (32 - n); ++ if (rv != 0) ++ { ++ *pprev_subset_coded = 1; ++ *psum = n + 1; ++ i = hevc_clz32(rv); ++ levels[i] = 2; ++ if (get_cabac(c, state_gt2) == 0) ++ { ++ // Unset first coded bit ++ rv &= ~(0x80000000U >> i); ++ } ++ } ++ ++ if (n_end > 8) { ++ const unsigned int g8 = n_end - 8; ++ rv |= ((1 << g8) - 1) << (24 - g8); ++ for (i = 0; i != g8; ++i) { ++ levels[i + 8] = 0; ++ } ++ } ++ ++ return rv; ++} ++ ++// extended_precision_processing_flag must be false given we are ++// putting the result into a 16-bit array ++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) ++// scale_m is uint8_t ++// ++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) ++// or it can be 2 (if we have transquant_bypass) ++// shift is set to one less than we really want but would normally be ++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
++// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6 ++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) ++// to achieve it ++ ++#ifndef trans_scale_sat ++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) ++{ ++ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); ++} ++#endif ++ ++ ++#ifndef update_rice ++static inline void update_rice(uint8_t * const stat_coeff, ++ const unsigned int last_coeff_abs_level_remaining, ++ const unsigned int c_rice_param) ++{ ++ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; ++ if (x >= 6) ++ (*stat_coeff)++; ++ else if (x == 0 && *stat_coeff > 0) ++ (*stat_coeff)--; ++} ++#endif ++ ++ ++// n must be > 0 on entry ++#ifndef get_cabac_sig_coeff_flag_idxs ++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t const * ctx_map, ++ uint8_t * p) ++{ ++ do { ++ if (get_cabac(c, state0 + ctx_map[n])) ++ *p++ = n; ++ } while (--n != 0); ++ return p; ++} ++#endif ++ ++ ++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t * ctx_map, // const ptr here but not in asm ++ uint8_t * const flag_idx) ++{ ++ int rv; ++ ++ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; ++ ++ return rv; ++} ++ ++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x1, x2, x3,\ ++ x4, x5, x6, x7,\ ++ x8, x9, x10, x11,\ ++ x12, x13, x14, x15} ++ ++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x4, x8, x12,\ ++ x1, x5, x9, x13,\ ++ x2, x6, x10, x14,\ ++ x3, x7, x11, x15} ++ ++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x4, x1, x8,\ 
++ x5, x2, x12, x9,\ ++ x6, x3, x13, x10,\ ++ x7, x14, x11, x15} ++ ++ ++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, ++ uint8_t * const significant_coeff_group_flag, ++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, ++ int * const pPrev_sig) ++{ ++ while (--i >= 0) { ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; ++ ++ // For the flag decode we only care about Z/NZ but ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // ++ // The group flag array is one longer than it needs to ++ // be so we don't need to check for y_cg limits ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); ++ ++ if (i == 0 || ++ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) ++ { ++ gf_y[0] |= (1 << x_cg); ++ *pPrev_sig = prev_sig; ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } ++} ++ ++ ++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? 
coeff << 16 : coeff & 0xffff; ++ } ++} ++ ++ ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx) ++{ ++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; ++ ++ int last_significant_coeff_x, last_significant_coeff_y; ++ int num_coeff = 0; ++ int prev_subset_coded = 0; ++ ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; ++ ++ const uint8_t *scan_x_cg, *scan_y_cg; ++ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ int use_vpu; ++#if RPI_COMPRESS_COEFFS ++ int num_nonzero = 0; ++ int use_compress = 0; ++ int *coeffs32; ++#endif ++ int use_dc = 0; ++ int16_t *coeffs; ++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; ++ ++ int i; ++ int shift,scale; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; ++ const int c_idx_nz = (c_idx != 0); ++ const int pred_mode_intra = c_idx_nz ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ int prev_sig = 0; ++ int may_hide_sign; ++ ++ int16_t dummy_coeffs[16]; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { ++ may_hide_sign = s->ps.pps->sign_data_hiding_flag; ++ ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { ++ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); ++ if (transform_skip_flag) { ++ trans_skip_or_bypass = 1; ++ if (lc->cu.pred_mode == MODE_INTRA && ++ s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26)) { ++ may_hide_sign = 0; ++ } ++ } ++ } ++ ++ { ++ static const uint8_t level_scale[8] = { ++ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 ++ }; ++ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; ++ ++ // Shift is set to one less than will actually occur as the scale ++ // and saturate step adds 1 and then shifts right again ++ scale = level_scale[qp6 & 7]; ++// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); ++ shift = log2_trafo_size - (qp6 >> 3); ++ ++ if (shift < 0) { ++ scale <<= -shift; ++ shift = 0; ++ } ++ } ++ ++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { ++ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? ++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ const unsigned int matrix_id = ++ lc->cu.pred_mode != MODE_INTRA ? 
3 + c_idx : c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; ++ dc_scale = scale_matrix[0]; ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } ++ else ++ { ++ static const uint8_t sixteen_scale[64] = { ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16 ++ }; ++ scale_matrix = sixteen_scale; ++ dc_scale = 16; ++ } ++ } else { ++ static const uint8_t unit_scale[64] = { ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ }; ++ scale_matrix = unit_scale; ++ shift = 0; ++ scale = 2; // We will shift right to kill this ++ dc_scale = 1; ++ ++ may_hide_sign = 0; ++ } ++ ++ ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && ++ trans_skip_or_bypass) { ++ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); ++ if (explicit_rdpcm_flag) { ++ may_hide_sign = 0; ++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); ++ } ++ } ++ ++ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); ++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * ++ (2 + (last_significant_coeff_x & 1)) + ++ suffix; ++ } ++ ++ if (last_significant_coeff_y > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); ++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * ++ (2 + (last_significant_coeff_y & 1)) + ++ suffix; 
++ } ++ ++ if (scan_idx == SCAN_VERT) ++ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y); ++ ++ x_cg_last_sig = last_significant_coeff_x >> 2; ++ y_cg_last_sig = last_significant_coeff_y >> 2; ++ ++ switch (scan_idx) { ++ case SCAN_DIAG: { ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; ++ ++ switch (log2_trafo_size) { ++ case 2: ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; ++ break; ++ case 3: ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; ++ break; ++ case 4: ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; ++ break; ++ case 5: ++ default: ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; ++ break; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing ++ use_vpu = 0; ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); ++ ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = dummy_coeffs; ++ // No need to clear ++ } 
++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++#if RPI_COMPRESS_COEFFS ++ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; ++#endif ++ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if RPI_COMPRESS_COEFFS ++ coeffs32 = (int*)coeffs; ++ if (!use_compress) ++#endif ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ } ++ ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; ++ unsigned int nb_significant_coeff_flag = 0; ++ ++ if (i == num_last_subset) { ++ // First time through ++ int last_scan_pos = num_coeff - (i << 4) - 1; ++ n_end = last_scan_pos - 1; ++ significant_coeff_flag_idx[0] = last_scan_pos; ++ nb_significant_coeff_flag = 1; ++ } else { ++ n_end = 15; ++ implicit_non_zero_coeff = (i != 0); ++ } ++ ++ if (n_end >= 0) { ++ static const uint8_t ctx_idx_maps_ts2[3][16] = { ++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 ++ }; ++ // N.B. 
prev_sig = Right * 2 + Down ++ static const uint8_t ctx_idx_maps[3][4][16] = { ++ { ++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; ++ ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ ctx_idx_map_p = ctx_idx_maps[0][3]; ++ scf_offset = 40 + c_idx_nz; ++ } else { ++ if (c_idx_nz != 0) ++ scf_offset = 27; ++ ++ if (log2_trafo_size == 2) { ++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; ++ } else { ++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; ++ if (!c_idx_nz) { ++ if (i != 0) ++ scf_offset += 3; ++ ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; ++ } else { ++ scf_offset += 21; ++ } ++ } else { ++ if (log2_trafo_size == 3) ++ scf_offset += 9; ++ else ++ scf_offset += 12; ++ } ++ } ++ } ++ ++ if (n_end > 0) { ++ int cnt = get_sig_coeff_flag_idxs(&lc->cc, ++ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, ++ n_end, ctx_idx_map_p, ++ significant_coeff_flag_idx + nb_significant_coeff_flag); ++ ++ nb_significant_coeff_flag += cnt; ++ if (cnt != 0) { ++ implicit_non_zero_coeff = 0; ++ } ++ } ++ ++ if (implicit_non_zero_coeff == 0) { ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ scf_offset = 42 + c_idx_nz; ++ } else { ++ if (i == 0) { ++ scf_offset = c_idx_nz ? 27 : 0; ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } ++ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } else { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } ++#if RPI_COMPRESS_COEFFS ++ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! ++ int16_t temp[32*32]; ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer ++ memcpy(temp, coeffs, sizeof(int)*num_nonzero); ++ coeffs32 = (int *)temp; ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ num_nonzero--; ++ while (num_nonzero >= 0) { ++ const unsigned int res = coeffs32[num_nonzero]; ++ const unsigned int offset = res & 0xffff; ++ coeffs[ offset ] = res >> 16; ++ num_nonzero--; ++ } ++ use_compress = 0; ++ } ++#endif ++ ++ if (nb_significant_coeff_flag != 0) { ++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | ++ ((i != 0 && !c_idx_nz) ? 
2 : 0) | ++ prev_subset_coded; ++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + ++ (gt1_idx_delta << 2); ++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + ++ gt1_idx_delta; ++ ++ const unsigned int x_cg = scan_x_cg[i]; ++ const unsigned int y_cg = scan_y_cg[i]; ++ int16_t * const blk_coeffs = coeffs + ++ ((x_cg + (y_cg << log2_trafo_size)) << 2); ++ // This calculation is 'wrong' for log2_traffo_size == 2 ++ // but that doesn't matter as in this case x_cg & y_cg ++ // are always 0 so result is correct (0) anyway ++ const uint8_t * const blk_scale = scale_matrix + ++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); ++ ++ // * The following code block doesn't deal with these flags: ++ // (nor did the one it replaces) ++ // ++ // cabac_bypass_alignment_enabled_flag ++ // This should be easy but I can't find a test case ++ // extended_precision_processing_flag ++ // This can extend the required precision past 16bits ++ // so is probably tricky - also no example found yet ++ ++#if USE_N_END_1 ++ if (nb_significant_coeff_flag == 1) { ++ // There is a small gain to be had from special casing the single ++ // transform coefficient case. The reduction in complexity ++ // makes up for the code duplicatioon. 
++ ++ int trans_coeff_level = 1; ++ int coeff_sign_flag; ++ int coded_val = 0; ++ ++ // initialize first elem of coeff_bas_level_greater1_flag ++ prev_subset_coded = 0; ++ ++ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { ++ trans_coeff_level = 2; ++ prev_subset_coded = 1; ++ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); ++ } ++ ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&lc->cc); ++ ++ if (coded_val) ++ { ++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { ++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); ++ } else { ++ uint8_t * const stat_coeff = ++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); ++ const unsigned int c_rice_param = *stat_coeff >> 2; ++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); ++ ++ trans_coeff_level = 3 + last_coeff_abs_level_remaining; ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ } ++ } ++ ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; ++ const int res = trans_scale_sat( ++ (trans_coeff_level ^ k) - k, // Apply sign ++ scale, ++ i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ else ++#endif ++ blk_coeffs[xy_off->coeff] = res; ++ } ++ } ++ else ++#endif ++ { ++ int sign_hidden = may_hide_sign; ++ int levels[16]; // Should be able to get away with int16_t but that fails some tests ++ uint32_t coeff_sign_flags; ++ uint32_t coded_vals = 0; ++ // Sum(abs(level[])) ++ // In fact we only need the bottom bit and in some future ++ // version that may be all we calculate ++ unsigned int sum_abs; ++ ++ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, ++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); ++ ++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) ++ sign_hidden = 0; ++ ++ // -- Start bypass block ++ ++ bypass_start(&lc->cc); ++ ++ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); ++ ++ if (coded_vals != 0) ++ { ++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; ++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : ++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); ++ int c_rice_param = !rice_adaptation_enabled ? 
0 : *stat_coeff >> 2; ++ int * level = levels - 1; ++ ++ do { ++ { ++ const unsigned int z = hevc_clz32(coded_vals) + 1; ++ level += z; ++ coded_vals <<= z; ++ } ++ ++ { ++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); ++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; ++ ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ ++ if (trans_coeff_level > (3 << c_rice_param) && ++ (c_rice_param < 4 || rice_adaptation_enabled)) ++ ++c_rice_param; ++ } ++ } while (coded_vals != 0); ++ } ++ ++ // sign_hidden = 0 or 1 so we can combine the tests ++ if ((sign_hidden & sum_abs) != 0) { ++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; ++ } ++ ++ bypass_finish(&lc->cc); ++ ++ // -- Finish bypass block ++ ++ // Scale loop ++ { ++ int m = nb_significant_coeff_flag - 1; ++ ++ // Deal with DC component (if any) first ++ if (i == 0 && significant_coeff_flag_idx[m] == 0) ++ { ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, scale, dc_scale, shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ { ++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); ++ } ++ else ++#endif ++ { ++ blk_coeffs[0] = res; ++ } ++ --m; ++ } ++ ++#if !USE_N_END_1 ++ // If N_END_1 set then m was at least 1 initially ++ if (m >= 0) ++#endif ++ { ++ do { ++ const xy_off_t * const xy_off = scan_xy_off + ++ significant_coeff_flag_idx[m]; ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, ++ scale, ++ blk_scale[xy_off->scale], ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ } else ++#endif ++ 
blk_coeffs[xy_off->coeff] = res; ++ } while (--m >= 0); ++ } ++ } ++ ++ } ++ } ++ } while ((i = next_subset(lc, i, c_idx_nz, ++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && ++ !cabac_overflow(&lc->cc)); ++ ++ if (lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { ++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++ if (rot) { ++ for (i = 0; i < 8; i++) ++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); ++ } ++ ++ s->hevcdsp.dequant(coeffs, log2_trafo_size); ++ ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ lc->cu.pred_mode == MODE_INTRA && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = explicit_rdpcm_flag ? 
explicit_rdpcm_dir_flag : (pred_mode_intra == 26); ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.transform_4x4_luma(coeffs); ++ } ++ else if (!use_vpu) ++ { ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) ++ { ++ if (use_dc) ++ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++ col_limit = FFMIN(4, col_limit); ++ else if (max_xy < 8) ++ col_limit = FFMIN(8, col_limit); ++ else if (max_xy < 12) ++ col_limit = FFMIN(24, col_limit); ++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); ++ } ++ } ++ } ++ ++#if 0 ++ // Mildly rotted - we support no mode where cross is valid ++ if (lc->tu.cross_pf) { ++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ const int ccount = 1 << (log2_trafo_size * 2); ++ ++ for (i = 0; i < ccount; i++) { ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ } ++#endif ++ ++ if (!use_dc) { ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero] = 0; ++ } ++#endif ++ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } ++} ++ ++#if !USE_BY22 ++// Decodes one motion vector difference; the result is returned packed by MV_XY(x, y) (variant built when USE_BY22 is 0) ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); /* x and y start out as the "greater than 0" flags */ ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if (x) ++ x += abs_mvd_greater1_flag_decode(lc); /* x is now a class selector 0, 1 or 2 - assumes the flag decoders return 0/1, TODO confirm */ ++ if (y) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ switch (x) { /* 2: value from mvd_decode, 1: value from the sign-flag decoder, 0: zero */ ++ case 2: x = mvd_decode(lc); break; ++ case 1: x = mvd_sign_flag_decode(lc); break; ++ case 0: x = 0; break; ++ } ++ ++ switch (y) { /* same selection as for x */ ++ case 2: y = mvd_decode(lc); break; ++ case 1: y = mvd_sign_flag_decode(lc); break; ++ case 0: y = 0; break; ++ } ++ return 
MV_XY(x,y); ++} ++#else ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) /* variant built when USE_BY22 is non-zero: same result, but magnitudes and signs are read through the by22 peek/flush helpers */ ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if ((x | y) == 0) ++ return 0; /* both components zero; NOTE(review): relies on 0 being a valid MvXY for (0,0) - confirm against MV_XY */ ++ ++ if (x != 0) ++ x += abs_mvd_greater1_flag_decode(lc); ++ if (y != 0) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ if ((x | y) == 1) /* neither component reached 2, so only sign flags remain to be read */ ++ { ++ // Not worth starting BY22 ++ if (x != 0) ++ x = mvd_sign_flag_decode(lc); ++ if (y != 0) ++ y = mvd_sign_flag_decode(lc); ++ } ++ else ++ { ++ CABACContext * const cc = &lc->cc; ++ uint32_t val; /* unshifted peeked word, handed back to get_cabac_by22_flush */ ++ uint32_t b; /* working copy of the peeked word, shifted left as bits are used */ ++ unsigned int n = 0; /* number of peeked bits used since the last flush */ ++ ++ bypass_start(cc); ++ b = val = get_cabac_by22_peek(cc); ++ ++ if (x == 1) { ++ x = ((int32_t)b >> 31) | 1; /* -1 when the top peeked bit is set, else +1 (relies on arithmetic right shift) */ ++ n = 1; ++ b <<= 1; ++ } ++ else if (x == 2) { ++ // EG1 so we have (leading one bits + 1) of suffix ++ // This makes prefix & suffix lengths the same ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ b <<= k; ++ n = 2 * k + 1; // Includes suffix & sign ++ ++ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked ++ // if we are going to do this without a flush ++ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) ++ { ++ // Need too many bits - flush ++ // n = k ++ get_cabac_by22_flush(cc, k, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ x = (b >> (32 - k)) + (1 << k); /* top k bits of b are the suffix; magnitude is suffix + (1 << k) */ ++ b <<= k; ++ s = (int32_t)b >> 31; /* s is -1 when the sign bit is set, else 0 */ ++ x = (x ^ s) - s; /* negates x when s == -1, leaves it unchanged when s == 0 */ ++ b <<= 1; ++ ++ // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) ++ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) ++ { ++ get_cabac_by22_flush(cc, n, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = 0; ++ } ++ } ++ ++ if (y == 1) { ++ y = ((int32_t)b >> 31) | 1; /* sign only: -1 when the top bit of b is set, else +1 */ ++ ++n; ++ // don't care about b anymore ++ } ++ else if (y == 2) { ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked ++ // if we are going to do this without a flush ++ b <<= k; ++ n += 2 * k + 1; ++ ++ if (n > CABAC_BY22_PEEK_BITS) ++ { ++ // Need too many bits - flush ++ get_cabac_by22_flush(cc, n - (k + 1), val); /* flushes everything used so far except this suffix and sign, which are re-peeked */ ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ y = (b >> (32 - k)) + (1 << k); /* magnitude is the k suffix bits + (1 << k), as for x */ ++ s = (int32_t)(b << k) >> 31; /* sign bit follows the suffix */ ++ y = (y ^ s) - s; ++ // don't care about b anymore ++ } ++ ++ get_cabac_by22_flush(cc, n, val); /* flush the n bits still accounted for, then call bypass_finish */ ++ bypass_finish(cc); ++ } ++ ++ return MV_XY(x, y); ++} ++#endif +diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h +new file mode 100644 +index 0000000000..ca191f00d9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac_fns.h +@@ -0,0 +1,217 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H ++#define AVCODEC_RPI_HEVC_CABAC_FNS_H ++ ++#include "config.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); ++ ++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, 
HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx); ++ ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); ++ ++#define HEVC_BIN_SAO_MERGE_FLAG 0 ++#define HEVC_BIN_SAO_TYPE_IDX 1 ++#define HEVC_BIN_SAO_EO_CLASS 2 ++#define HEVC_BIN_SAO_BAND_POSITION 2 ++#define HEVC_BIN_SAO_OFFSET_ABS 2 ++#define HEVC_BIN_SAO_OFFSET_SIGN 2 ++#define HEVC_BIN_END_OF_SLICE_FLAG 2 ++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 ++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 ++#define HEVC_BIN_SKIP_FLAG 6 ++#define HEVC_BIN_CU_QP_DELTA 9 ++#define HEVC_BIN_PRED_MODE 12 ++#define HEVC_BIN_PART_MODE 13 ++#define HEVC_BIN_PCM_FLAG 17 ++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 ++#define HEVC_BIN_MPM_IDX 18 ++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 ++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 ++#define HEVC_BIN_MERGE_FLAG 20 ++#define HEVC_BIN_MERGE_IDX 21 ++#define HEVC_BIN_INTER_PRED_IDC 22 ++#define HEVC_BIN_REF_IDX_L0 27 ++#define HEVC_BIN_REF_IDX_L1 29 ++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 ++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 ++#define HEVC_BIN_ABS_MVD_MINUS2 35 ++#define HEVC_BIN_MVD_SIGN_FLAG 35 ++#define HEVC_BIN_MVP_LX_FLAG 35 ++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 ++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 ++#define HEVC_BIN_CBF_LUMA 40 ++#define HEVC_BIN_CBF_CB_CR 42 ++#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 ++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 ++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 ++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 ++#define 
HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 ++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 ++#define HEVC_BIN_COEFF_SIGN_FLAG 166 ++#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 ++#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 ++ ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); ++ ++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { ++ const uint8_t *ptr = c->bytestream; ++ ++ if (c->low & 0x1) ++ ptr--; ++#if CABAC_BITS == 16 ++ if (c->low & 0x1FF) ++ ptr--; ++#endif ++ if ((int) (c->bytestream_end - ptr) < n) ++ return NULL; ++ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) ++ return NULL; ++ ++ return ptr; ++} ++ ++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); ++} ++ ++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int ct_depth, ++ const unsigned int x0, const unsigned int y0) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + ++ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + ++ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); ++} ++ ++static inline int 
ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int x_cb, const int y_cb) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + ++ (s->cabac_stash_left[y0 >> 3] & 1) + ++ (s->cabac_stash_up[x0 >> 3] & 1)); ++} ++ ++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); ++} ++ ++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); ++} ++ ++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); ++} ++ ++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); ++} ++ ++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + 
HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); ++} ++ ++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); ++} ++ ++ ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c +new file mode 100644 +index 0000000000..341bb77d9d +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.c +@@ -0,0 +1,75 @@ ++/* ++ * HEVC shared tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++ ++#include "rpi_hevc_data.h" ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { /* x coordinate of the i-th position of the diagonal scan over a 4x4 block; pairs with ff_hevc_rpi_diag_scan4x4_y */ ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 1, 2, ++ 3, 2, 3, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { /* y coordinate of the i-th position of the 4x4 diagonal scan */ ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 3, 2, ++ 1, 3, 2, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { /* x coordinate of the i-th position of the diagonal scan over an 8x8 block */ ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 0, 1, ++ 2, 3, 4, 0, ++ 1, 2, 3, 4, ++ 5, 0, 1, 2, ++ 3, 4, 5, 6, ++ 0, 1, 2, 3, ++ 4, 5, 6, 7, ++ 1, 2, 3, 4, ++ 5, 6, 7, 2, ++ 3, 4, 5, 6, ++ 7, 3, 4, 5, ++ 6, 7, 4, 5, ++ 6, 7, 5, 6, ++ 7, 6, 7, 7, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { /* y coordinate of the i-th position of the 8x8 diagonal scan */ ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 4, 3, ++ 2, 1, 0, 5, ++ 4, 3, 2, 1, ++ 0, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 7, ++ 6, 5, 4, 3, ++ 2, 7, 6, 5, ++ 4, 3, 7, 6, ++ 5, 4, 7, 6, ++ 5, 7, 6, 7, ++}; +diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h +new file mode 100644 +index 0000000000..0aee673d8b +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.h +@@ -0,0 +1,31 @@ ++/* ++ * HEVC shared data tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_DATA_H ++#define AVCODEC_RPI_HEVC_DATA_H ++ ++#include <stdint.h> ++ ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; /* diagonal scan coordinate tables, defined in rpi_hevc_data.c */ ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; ++ ++#endif /* AVCODEC_RPI_HEVC_DATA_H */ +diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c +new file mode 100644 +index 0000000000..5125d1eb6b +--- /dev/null ++++ b/libavcodec/rpi_hevc_filter.c +@@ -0,0 +1,1210 @@ ++/* ++ * HEVC video decoder ++ * ++ * Originally by: ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Seppo Tomperi ++ * Copyright (C) 2013 Wassim Hamidouche ++ * ++ * Substantially rewritten: ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++//#define DISABLE_SAO ++//#define DISABLE_DEBLOCK ++//#define DISABLE_STRENGTHS ++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) ++//#define DISABLE_DEBLOCK_NONREF ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++ ++#include "rpi_hevcdec.h" ++ ++#include "bit_depth_template.c" ++ ++#include "rpi_qpu.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#define LUMA 0 ++#define CB 1 ++#define CR 2 ++ ++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 ++// so -12,75 overall ++static const uint8_t tctablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 ++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 ++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 ++ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 ++}; ++#define tctable (tctablex + 12 + 6*8) /* skips the 60 leading zero entries, so tctable[0] is the first "QP 0" entry and indices down to -60 stay inside tctablex */ ++ ++static const uint8_t betatablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 ++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 ++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 ++}; ++#define betatable (betatablex 
+ 12 + 6*8) /* same 60-entry offset as tctable: betatable[0] is the first "QP 0" entry */ ++ ++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, /* returns the tctable entry for a chroma plane */ ++ const int c_idx, const int tc_offset) ++{ ++ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; /* qp_y is first mapped through the PPS table qp_dblk_x[c_idx], then tc_offset + 2 is added to form the index */ ++} ++ ++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, /* returns the rounded mean of a left and an upper QP predictor for the quantisation group holding (xBase, yBase) */ ++ const unsigned int xBase, const unsigned int yBase) ++{ ++ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; ++ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; ++ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; /* (xQgBase, yQgBase): position rounded down to a multiple of 1 << log2_min_cu_qp_delta_size */ ++ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; /* same position in min-CB units, used to index qp_y_tab */ ++ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; ++ const int qPy_pred = lc->qPy_pred; ++ ++ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : /* at the left edge of a CTB lc->qPy_pred is used instead of the left neighbour's entry */ ++ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + ++ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : /* likewise at the top edge of a CTB for the upper neighbour */ ++ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; ++} ++ ++// * Only called from bitstream decode in foreground ++// so should be safe ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) /* writes lc->qp_y: the prediction from get_qPy_pred, adjusted by lc->tu.cu_qp_delta when that is non-zero */ ++{ ++ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); ++ ++ if (lc->tu.cu_qp_delta != 0) { ++ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere ++ int off = s->ps.sps->qp_bd_offset; ++ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, /* wraps the sum modulo (52 + off), then subtracts off again */ ++ 52 + off) - off; ++ } else ++ lc->qp_y = qp_y; ++} ++ ++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) /* luma (c_idx == 0) uses sps->pixel_shift; any other plane uses one more - presumably because chroma samples are stored as interleaved pairs, TODO confirm */ ++{ ++ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; ++} ++ ++// "DSP" these? 
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) /* copies one element of 4 bytes (pixel_shift 2), 2 bytes (pixel_shift 1) or 1 byte (any other value) from src to dst */ ++{ ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } ++} ++ ++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, /* saves the first/last rows and first/last columns of the width x height block at src into s->sao_pixel_buffer_h / s->sao_pixel_buffer_v */ ++ ptrdiff_t stride_src, int x, int y, int width, int height, ++ int c_idx, int x_ctb, int y_ctb) ++{ ++ const unsigned int sh = pixel_shift(s, c_idx); ++ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); ++ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); ++ ++ /* copy horizontal edges */ ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), /* first row goes to buffer row 2 * y_ctb */ ++ src, width << sh); ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), /* last row goes to buffer row 2 * y_ctb + 1 */ ++ src + stride_src * (height - 1), width << sh); ++ ++ /* copy vertical edges */ ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); /* first column goes to buffer column 2 * x_ctb */ ++ ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); /* last column goes to buffer column 2 * x_ctb + 1 */ ++} ++ ++// N.B. Src & dst are swapped as this is a restore! ++// x0 & y0 are in luma coords ++// Width & height are in Y/C pels as appropriate ++// * Clear scope for optimisation here but not used enough to be worth it ++static void restore_tqb_pixels(const HEVCRpiContext * const s, /* copies back, from dst1 into src1, every block whose bit is set in s->is_pcm; does nothing unless one of the two flags tested below is set */ ++ uint8_t *src1, const uint8_t *dst1, ++ const ptrdiff_t stride_src, const ptrdiff_t stride_dst, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int width, const int height, ++ const int c_idx) ++{ ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; ++ int blks_y = height >> (c_idx == 0 ? 
3 : 2); ++ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand ++ const unsigned int bheight = (c_idx == 0) ? 8 : 4; ++ const unsigned int sh = ((x0 >> 3) & 7); /* bit position inside the is_pcm byte for the first block of the row */ ++ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1; /* one bit per block across the width */ ++ ++ do { ++ unsigned int m = (*pcm >> sh) & mask; ++ uint8_t * bd = src1; ++ const uint8_t * bs = dst1; ++ while (m != 0) { /* walk the set bits left to right; each set bit copies one bwidth x bheight block */ ++ if ((m & 1) != 0) { ++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); ++ } ++ m >>= 1; ++ bs += bwidth; ++ bd += bwidth; ++ } ++ src1 += stride_src * bheight; ++ dst1 += stride_dst * bheight; ++ pcm += s->ps.sps->pcm_width; ++ } while (--blks_y > 0); ++ } ++} ++ ++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) /* element (x, y) of a per-CTB table stored row by row; needs a variable named s in scope */ ++ ++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) /* runs the SAO band/edge filters on the CTB containing (x, y); x_ctb and y_ctb below are x and y shifted right by log2_ctb_size */ ++{ ++#if SAO_FILTER_N == 5 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++ int x_ctb = x >> s->ps.sps->log2_ctb_size; ++ int y_ctb = y >> s->ps.sps->log2_ctb_size; ++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; ++ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); ++ // flags indicating unfilterable edges ++ uint8_t vert_edge[] = { 0, 0 }; ++ uint8_t horiz_edge[] = { 0, 0 }; ++ uint8_t diag_edge[] = { 0, 0, 0, 0 }; ++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); ++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && ++ !s->ps.pps->loop_filter_across_tiles_enabled_flag; ++ uint8_t restore = no_tile_filter || !lfase; /* set when filtering across a tile or slice boundary may be disallowed; selects the sao_edge_restore variant later */ ++ uint8_t left_tile_edge = 0; ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ 
uint8_t bottom_tile_edge = 0; ++ const int sliced = 1; ++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1); ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ ++#ifdef DISABLE_SAO ++ return; ++#endif ++ ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; ++ } ++ if (!edges[2]) { ++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; ++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; ++ } ++ if (!edges[1]) { ++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; ++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; ++ } ++ if (!edges[3]) { ++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; ++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[1]) { ++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; ++ } ++ if (!edges[1] && !edges[2]) { ++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; ++ } ++ if (!edges[2] && 
!edges[3]) { ++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[3]) { ++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; ++ } ++ } ++ ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const int x0 = x >> hshift; ++ const int y0 = y >> vshift; ++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); ++ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; ++ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; ++ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); ++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++ ++// if (c_idx == 1) ++// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); ++ ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ [2*MAX_PB_SIZE*MAX_PB_SIZE]; ++ dst = dstbuf; ++ stride_dst = 2*MAX_PB_SIZE; ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ } else { ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ case SAO_EDGE: ++ { ++ const int w = s->ps.sps->width >> hshift; ++ const int h = s->ps.sps->height >> vshift; ++ int top_edge = edges[1]; ++ int bottom_edge = edges[3]; ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ 
[RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; ++ ++ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; ++ dst = dstbuf + stride_dst + 32; ++ ++ if (!top_edge) { ++ uint8_t *dst1; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); ++ ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); ++ } ++ } ++ if (!bottom_edge) { ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); ++ } ++ } ++ if (src_l != NULL) { ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ if (src_r != NULL) { ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ } ++ } ++ } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 
255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = frame_stride1(s->frame, 1); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif ++} ++ ++// When bits are delivered to deblock we want them ++//#define TL 1 ++//#define TR 2 ++//#define BL 4 ++//#define BR 8 ++ ++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br ++// so we need to rearrange before passing on ++ ++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return (pcm[0] | ++ (pcm[1] << 8) | ++ (pcm[s->ps.sps->pcm_width] << 16) | ++ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); ++} ++ ++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); ++} ++ ++// We cast away const here as we 
want this to work for both get and set ++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint32_t *)(bs + ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#warning Unexpected masks ++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint8_t *)(bs + ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++ ++// Get block strength ++// Given how we call we will always get within the 32bit boundries ++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, ++ unsigned int xl, unsigned int xr, const unsigned int y) ++{ ++ if (xr <= xl) { ++ return 0; ++ } ++ else ++ { ++#if HAVE_ARMV6T2_INLINE ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#error This case not yet handled in bs_get32 ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ uint32_t tmp; ++ __asm__ ( ++ "lsr %[tmp], %[xl], %[xl_shift] \n\t" ++ "rsb %[xr], %[xl], %[xr] \n\t" ++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" ++ "add %[xr], %[xr], #7 \n\t" ++ "lsr %[bs], %[y], %[y_shift1] \n\t" ++ "bic %[xr], %[xr], #7 \n\t" ++ "ubfx %[xl], %[xl], #1, #5 \n\t" ++ "lsr %[xr], %[xr], #1 \n\t" ++ "cmp %[xr], #32 \n\t" ++ 
"mvn %[tmp], #0 \n\t" ++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" ++ "lsl %[tmp], %[tmp], %[xr] \n\t" ++ "lsr %[xl], %[bs], %[xl] \n\t" ++ "it ne \n\t" ++ "bicne %[bs], %[xl], %[tmp] \n\t" ++ : // Outputs ++ [bs]"+r"(bs), ++ [stride2]"+r"(stride2), ++ [xl]"+r"(xl), ++ [xr]"+r"(xr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [y]"r"(y), ++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), ++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), ++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ : // Clobbers ++ "cc" ++ ); ++ return (uint32_t) bs; ++#else ++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); ++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; ++ ++ return n == 32 ? a : ++ (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++#endif ++ } ++} ++ ++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); ++} ++ ++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); ++} ++ ++ ++static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * cb_dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); ++ ++ unsigned int cb_x; ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) ++ { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 8); ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; ++ const unsigned int bh_l = bv_l - 8; ++ unsigned int y; ++ ++ // Main body ++ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) ++ { ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); ++ ++ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ if (vbs != 0) ++ { ++ const uint8_t * const tcv = tctable + dbp->tc_offset; ++ const uint8_t * const betav = betatable + dbp->beta_offset; ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) ++ { ++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) ++ { ++ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betav[qp], ++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | ++ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), ++ pcmfa & 3, ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); ++ } ++ } ++ } ++ ++ if (y != 0) ++ { ++ uint32_t hbs; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) ++ { ++ const unsigned int x = bh_l; ++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const DBParams * const dbph = dbp - 1; ++ const uint8_t * const tc = tctable + dbph->tc_offset + qp; ++ ++ av_assert2(cb_x - bh_l == 8); ++ ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbph->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ ++ // H ++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); ++ ++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) ++ { ++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) ++ { ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + dbp->tc_offset + qp; ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbp->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 
0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ } ++ } ++ } ++ ++ } ++ } ++} ++ ++static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; ++} ++ ++static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); ++ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; ++ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; ++ ++ unsigned int cb_x; ++ ++ av_assert1((bounds.x & (ctb_size - 1)) == 0); ++ av_assert1((bounds.y & (ctb_size - 1)) == 0); ++ av_assert1(bounds.h <= ctb_size); ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 16); ++ unsigned int y; ++ ++ // V above ++ if (bounds.y != 0) { ++ // Deblock V up 8 ++ // CTB above current ++ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm ++ const unsigned int y = bounds.y - 8; ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; ++ ++ if (vbs != 0) ++ { ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ pcmfa & 3); ++ } ++ } ++ } ++ } ++ ++ for (y = bounds.y; y < b_b; y += 16) ++ { ++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | ++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); ++ ++ // V ++ if (vbs != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = ++ (y + 16 > b_b ? 
++ pcm2(s, bv_l - 1, y) | 0xffff0000 : ++ pcm4(s, bv_l - 1, y)); ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ const int qp1 = q2h(s, x, y + 8); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ } ++ ++ // H ++ if (y != 0) ++ { ++ uint32_t hbs; ++ const unsigned int bh_l = bv_l - 16; ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ // Stub is width 8 to the left of bounds, but width 16 internally ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) ++ { ++ unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ ++ // Chop off bits we don't want... ++ if (bh_l < bounds.x) { ++ pcmfa |= 0x10001; // TL|BL pre rearrangement ++ hbs &= ~3; // Make BS 0 ++ } ++ ++ // Double check we still want this ++ if (hbs != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const unsigned int x = bh_l; ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 
0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ ++ // H main ++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it ++ ++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) ++ { ++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ } ++ } ++ } ++ } ++} ++ ++static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n) ++{ ++ return x & ~(~0U << log2_n); ++} ++ ++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++{ ++ av_assert2((y & 7) == 0); ++ ++ // This doesn't have the same simultainious update issues that bsf_stash ++ // does (other threads will have a different y) so we can do it the easy way ++ if ((bsf &= mask) != 0) ++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); ++} ++ ++ ++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++{ ++ // We arrange this in a slightly odd fashion but it lines up with ++ // how we are going to use it in the actual deblock code & it is easier ++ // to 
do the contortions here than there ++ // ++ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},... ++ ++ av_assert2((x & 7) == 0); ++ ++ if ((bsf &= mask) != 0) ++ { ++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); ++ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; ++ ++ if (mask <= 0xf) ++ { ++ *p |= (bsf << sh); ++ } ++ else ++ { ++ do { ++ *p |= (bsf & 0xf) << sh; ++ p += HEVC_RPI_BS_STRIDE1_BYTES; ++ } while ((bsf >>= 4) != 0); ++ } ++ } ++} ++ ++static inline uint32_t bsf_mv(const HEVCRpiContext * const s, ++ const unsigned int rep, const unsigned int dup, ++ const unsigned int mvf_stride0, ++ const unsigned int mvf_stride1, ++ const RefPicList * const rpl_p, const RefPicList * const rpl_q, ++ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) ++{ ++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, ++ mvf_p, mvf_q, ++ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, ++ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); ++} ++ ++ ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, ++ const int is_coded_block) ++{ ++ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; ++ const RefPicList * const rpl = s->refPicList; ++ // Rep count for bsf_mv when running with min_pu chuncks ++ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; ++ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; ++ const unsigned int trafo_size = (1U << log2_trafo_size); ++ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; ++ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); ++ ++ // Do we cover a pred split line? 
++ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split; ++ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split; ++ ++ uint32_t bsf_h; ++ uint32_t bsf_v; ++ ++#ifdef DISABLE_STRENGTHS ++ return; ++#endif ++ ++ // We are always on a size boundary ++ av_assert2((x0 & (trafo_size - 1)) == 0); ++ av_assert2((y0 & (trafo_size - 1)) == 0); ++ // log2_trafo_size not really a transform size; we can have to deal ++ // with size 2^6 blocks ++ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6); ++ ++ // Retrieve and update coded (b0), intra (b1) bs flags ++ // ++ // Store on min width (rather than uint32_t) to avoid possible issues ++ // with another thread on another core running wpp using the same ++ // memory (min CTB = 16 pels = 4 bsf els = 8 bits) ++ // ++ // In bsf BS=2 is represented by 3 as it is much easier to test & set ++ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and ++ // 3 will work the same ++ { ++ // Given where we are called from is_cbf_luma & is_intra will be constant over the block ++ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? 
bsf_cbf : 0; ++ uint8_t *const p = s->bsf_stash_up + (x0 >> 4); ++ uint8_t *const q = s->bsf_stash_left + (y0 >> 4); ++ ++ switch (log2_trafo_size) ++ { ++ case 2: ++ case 3: ++ { ++ const unsigned int sh_h = (x0 >> 1) & 7; ++ const unsigned int sh_v = (y0 >> 1) & 7; ++ bsf_h = *p; ++ bsf_v = *q; ++ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); ++ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v); ++ bsf_h >>= sh_h; ++ bsf_v >>= sh_v; ++ break; ++ } ++ case 4: ++ bsf_h = *p; ++ bsf_v = *q; ++ *p = bsf0; ++ *q = bsf0; ++ break; ++ case 5: ++ bsf_h = *(uint16_t *)p; ++ bsf_v = *(uint16_t *)q; ++ *(uint16_t *)p = bsf0; ++ *(uint16_t *)q = bsf0; ++ break; ++ case 6: ++ default: ++ bsf_h = *(uint32_t *)p; ++ bsf_v = *(uint32_t *)q; ++ *(uint32_t *)p = bsf0; ++ *(uint32_t *)q = bsf0; ++ break; ++ } ++ ++ bsf_h |= bsf0; ++ bsf_v |= bsf0; ++ } ++ ++ // Do Horizontal ++ if ((y0 & 7) == 0) ++ { ++ // Boundary upper ++ if (y0 != 0 && ++ (off_boundary(y0, s->ps.sps->log2_ctb_size) || ++ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)) ++ { ++ // Look at MVs (BS=1) if we don't already has a full set of bs bits ++ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split)) ++ { ++ // If we aren't on the top boundary we must be in the middle ++ // and in that case we know where mvf can change ++ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; ++ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? 
++ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : ++ rpl; ++ ++ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ rpl, rpl_top, ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); ++ } ++ ++ // Finally put the results into bs ++ hbs_set(s, x0, y0, bsf_mask, bsf_h); ++ } ++ ++ // Max of 1 pu internal split - ignore if not on 8pel boundary ++ if (has_y_split && !off_boundary(lc->cu.y_split, 3)) ++ { ++ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); ++ // If we have the x split as well then it must be in the middle ++ const unsigned int log2_rep = has_x_split ? 1 : 0; ++ ++ hbs_set(s, x0, lc->cu.y_split, bsf_mask, ++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ rpl, rpl, ++ mvf, mvf - MVF_STASH_WIDTH_PU)); ++ } ++ } ++ ++ // And again for vertical - same logic as horizontal just in the other direction ++ if ((x0 & 7) == 0) ++ { ++ // Boundary left ++ if (x0 != 0 && ++ (off_boundary(x0, s->ps.sps->log2_ctb_size) || ++ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) ++ { ++ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) ++ { ++ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; ++ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? 
++ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : ++ rpl; ++ ++ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ rpl, rpl_left, ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); ++ } ++ ++ vbs_set(s, x0, y0, bsf_mask, bsf_v); ++ } ++ ++ if (has_x_split && !off_boundary(lc->cu.x_split, 3)) ++ { ++ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); ++ const unsigned int log2_rep = has_y_split ? 1 : 0; ++ ++ vbs_set(s, lc->cu.x_split, y0, bsf_mask, ++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ rpl, rpl, ++ mvf, mvf - 1)); ++ } ++ } ++} ++ ++#undef LUMA ++#undef CB ++#undef CR ++ ++static inline unsigned int ussub(const unsigned int a, const unsigned int b) ++{ ++ return a < b ? 
0 : a - b; ++} ++ ++static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) ++{ ++ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; ++} ++ ++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) ++{ ++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ int x, y; ++ ++ const unsigned int br = bounds.x + bounds.w; ++ const unsigned int bb = bounds.y + bounds.h; ++ ++ const int x_end = (br >= s->ps.sps->width); ++ const int y_end = (bb >= s->ps.sps->height); ++ ++ // Deblock may not touch the edges of the bound as they are still needed ++ // for Intra pred ++ // ++ // Deblock is disabled with a per-slice flag ++ // Given that bounds may cover multiple slices & we dblock outside bounds ++ // anyway we can't avoid deblock using that flag - about the only thing we ++ // could do is have a "no deblock seen yet" flag but it doesn't really ++ // seem worth the effort ++ ++ deblock_y_blk(s, bounds, x_end, y_end); ++ deblock_uv_blk(s, bounds, x_end, y_end); ++ ++ // SAO needs ++ // (a) CTB alignment ++ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel ++ { ++ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); ++ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); ++ const unsigned int yt = ussub(bounds.y, yo); ++ const unsigned int yb = y_end ? bb : ussub(bb, yo); ++ const unsigned int xl = ussub(bounds.x, xo); ++ const unsigned int xr = x_end ? br : ussub(br, xo); ++ ++ if (s->ps.sps->sao_enabled) ++ { ++ for (y = yt; y < yb; y += ctb_size) { ++ for (x = xl; x < xr; x += ctb_size) { ++ sao_filter_CTB(s, x, y); ++ } ++ } ++ } ++ ++ // Cache invalidate ++ y = 0; ++ if (xr != 0 && yb != 0) ++ { ++ const unsigned int llen = ++ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); ++ const unsigned int mask = ~(llen - 1); ++ const unsigned int il = (xl == 0) ? 
0 : (xl - 1) & mask; ++ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask; ++ const unsigned int it = ussub(yt, 1); ++ const unsigned int ib = y_end ? bb : yb - 1; ++ ++ if (il < ir) { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ il, it, ir - il, ib - it, ++ ctx_vshift(s, 1), 1, 1); ++ ++ // If we have to commit the right hand tile boundry due to ++ // cache boundry considerations then at EoTile we must commit ++ // that boundry to bottom of tile (bounds) ++ if (ib != bb && ir == br && eot) { ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ br - 1, ib, 1, bb - ib, ++ ctx_vshift(s, 1), 1, 1); ++ } ++ ++ rpi_cache_flush_finish(rfe); ++ ++ if (x_end) ++ y = y_end ? INT_MAX : ib; ++ ++// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1); ++ } ++ } ++ } ++ ++ return y; ++} ++ +diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h +new file mode 100644 +index 0000000000..6b36f5e737 +--- /dev/null ++++ b/libavcodec/rpi_hevc_mv.h +@@ -0,0 +1,71 @@ ++#ifndef AVCODEC_RPI_HEVC_MV_H ++#define AVCODEC_RPI_HEVC_MV_H ++ ++#include "config.h" ++ ++typedef int32_t MvXY; ++ ++typedef struct HEVCRpiMvField { ++ MvXY xy[2]; ++ int8_t ref_idx[2]; ++ int8_t pred_flag; ++ int8_t dummy; // To 12 bytes ++} HEVCRpiMvField; ++ ++ ++#define MV_X(xy) (((xy) << 16) >> 16) ++#define MV_Y(xy) ((xy) >> 16) ++#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16)) ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_mv_arm.h" ++#endif ++ ++#ifndef mvxy_add ++static inline MvXY mvxy_add(const MvXY a, const MvXY b) ++{ ++ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); ++} ++#endif ++ ++ ++#ifndef mv_scale_xy ++static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb) ++{ ++ int tx, scale_factor; ++ ++ td = td == 0 ? 
1 : av_clip_int8(td); ++ tb = av_clip_int8(tb); ++ tx = (0x4000 + (abs(td) >> 1)) / td; ++ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); ++ return MV_XY( ++ av_clip_int16((scale_factor * MV_X(src) + 127 + ++ (scale_factor * MV_X(src) < 0)) >> 8), ++ av_clip_int16((scale_factor * MV_Y(src) + 127 + ++ (scale_factor * MV_Y(src) < 0)) >> 8)); ++} ++#endif ++ ++// 8.3.1 states that the bitstream may not contain poc diffs that do not ++// fit in 16 bits, so given that we don't care about the high bits we only ++// store the low 16 + LT & Inter flags ++ ++#define COL_POC_INTRA 0 ++#define COL_POC_INTER (1 << 16) ++#define COL_POC_LT (1 << 17) ++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) ++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) ++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) ++ ++typedef struct ColMv_s { ++ int32_t poc; ++ int32_t xy; ++} ColMv; ++ ++typedef struct ColMvField_s { ++ ColMv L[2]; ++} ColMvField; ++ ++ ++ ++#endif // AVCODEC_RPI_HEVC_MV_H +diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c +new file mode 100644 +index 0000000000..27a9f69525 +--- /dev/null ++++ b/libavcodec/rpi_hevc_mvs.c +@@ -0,0 +1,487 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Anand Meher Kotra ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++static av_always_inline int ++is_eq_mer(const unsigned int plevel, ++ const unsigned int xN, const unsigned int yN, ++ const unsigned int xP, const unsigned int yP) ++{ ++ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; ++} ++ ++// check if the mv's and refidx are the same between A and B ++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return a->pred_flag == b->pred_flag && ++ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && ++ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); ++ return 0; ++} ++ ++/* ++ * 8.5.3.1.7 temporal luma motion vector prediction ++ */ ++static int temporal_luma_motion_vector(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ MvXY * const mvLXCol, const int X) ++{ ++ int x, y; ++ const ColMv * cmv = NULL; ++ ++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; ++ const RefPicList * const refPicList = s->refPicList + X; ++ const int cur_lt = refPicList->isLongTerm[refIdxLx]; ++ ++ *mvLXCol = 0; ++ // Unlikely but we might have a col_ref IDR frame! 
++ if (col_ref->col_mvf == NULL) ++ return 0; ++ ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); ++ ++ //bottom right collocated motion vector ++ x = x0 + nPbW; ++ y = y0 + nPbH; ++ ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ y < s->ps.sps->height && ++ x < s->ps.sps->width) ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ ++ // derive center collocated motion vector ++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) ++ { ++ cmv = NULL; ++ x = x0 + (nPbW >> 1); ++ y = y0 + (nPbH >> 1); ++ ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ } ++ ++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) ++ return 0; ++ ++ { ++ const int col_poc = col_ref->poc; ++ const int ref_poc = refPicList->list[refIdxLx]; ++ ++ *mvLXCol = (cur_lt || ++ cmv->poc == col_poc || ++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
++ cmv->xy : ++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); ++ } ++ ++ return cmv != NULL; ++} ++ ++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return b != NULL && compare_mv_ref_idx(a, b); ++} ++ ++ ++ ++/* ++ * 8.5.3.1.2 Derivation process for spatial merging candidates ++ */ ++static inline const HEVCRpiMvField * ++derive_spatial_merge_candidates( ++ const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ const unsigned int part_idx, ++ const unsigned int merge_idx, ++ HEVCRpiMvField * const mvf_t) ++{ ++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); ++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; ++ const unsigned int part_mode = lc->cu.part_mode; ++ ++ const HEVCRpiMvField * perm[4]; ++ unsigned int nb_merge_cand = 0; ++ ++ // singleMCLFlag => part_idx == 0 so no need to test for it ++ if ((avail & AVAIL_L) == 0 || ++ (part_idx == 1 && ++ ((parts_a1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || ++ mvf_a1->pred_flag == PF_INTRA) ++ { ++ mvf_a1 = NULL; ++ } ++ else ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a1; ++ perm[nb_merge_cand++] = mvf_a1; ++ } ++ ++ if ((avail & AVAIL_U) == 0 || ++ (part_idx == 1 && ++ ((parts_b1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || ++ mvf_b1->pred_flag == PF_INTRA) ++ { ++ mvf_b1 = 
NULL; ++ } ++ else if (!mvf_eq(mvf_b1, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b1; ++ perm[nb_merge_cand++] = mvf_b1; ++ } ++ ++ // above right spatial merge candidate ++ // Never need mvf_b0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_UR) != 0 && ++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && ++ mvf_b0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b0, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b0; ++ perm[nb_merge_cand++] = mvf_b0; ++ } ++ ++ // left bottom spatial merge candidate ++ // Never need mvf_a0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_DL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && ++ mvf_a0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_a0, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a0; ++ perm[nb_merge_cand++] = mvf_a0; ++ } ++ ++ // above left spatial merge candidate ++ if (nb_merge_cand != 4 && ++ (avail & AVAIL_UL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) ++ { ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ ++ if (mvf_b2->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b2, mvf_a1) && ++ !mvf_eq(mvf_b2, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b2; ++ perm[nb_merge_cand++] = mvf_b2; ++ } ++ } ++ ++ // temporal motion vector candidate ++ if (s->sh.slice_temporal_mvp_enabled_flag) ++ { ++ static const HEVCRpiMvField mvf_z = {{0}}; ++ ++ *mvf_t = mvf_z; ++ ++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 0, 0)) ++ mvf_t->pred_flag = PF_L0; ++ ++ if (s->sh.slice_type == HEVC_SLICE_B && ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 1, 1)) ++ mvf_t->pred_flag |= PF_L1; ++ ++ if (mvf_t->pred_flag != 0) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_t; ++ perm[nb_merge_cand++] = mvf_t; ++ } ++ } ++ ++ // combined bi-predictive merge candidates (applies for B slices) ++ if (s->sh.slice_type == HEVC_SLICE_B && 
nb_merge_cand > 1) ++ { ++ unsigned int comb_idx = 0; ++ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); ++ const RefPicList * const refPicList = s->refPicList; ++ ++ for (comb_idx = 0; comb_idx < cand_count; comb_idx++) ++ { ++ static const uint8_t l0_l1_cand_idx[12][2] = { ++ { 0, 1, }, ++ { 1, 0, }, ++ { 0, 2, }, ++ { 2, 0, }, ++ { 1, 2, }, ++ { 2, 1, }, ++ { 0, 3, }, ++ { 3, 0, }, ++ { 1, 3, }, ++ { 3, 1, }, ++ { 2, 3, }, ++ { 3, 2, }, ++ }; ++ ++ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; ++ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; ++ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; ++ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; ++ ++ if ((mvf_c0->pred_flag & PF_L0) != 0 && ++ (mvf_c1->pred_flag & PF_L1) != 0 && ++ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || ++ mvf_c0->xy[0] != mvf_c1->xy[1])) ++ { ++ if (merge_idx == nb_merge_cand++) ++ { ++ // Need to be a bit careful as we will construct mvf_t and we ++ // may already be using that as one of our condidates ++ // so build & copy rather than build in place ++ const HEVCRpiMvField mvf_m = { ++ .xy = { ++ mvf_c0->xy[0], ++ mvf_c1->xy[1]}, ++ .ref_idx = { ++ mvf_c0->ref_idx[0], ++ mvf_c1->ref_idx[1]}, ++ .pred_flag = PF_BI ++ }; ++ *mvf_t = mvf_m; ++ return mvf_t; ++ } ++ } ++ } ++ } ++ ++ // "append" Zero motion vector candidates ++ { ++ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? ++ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; ++ const unsigned int zero_idx = merge_idx - nb_merge_cand; ++ ++ const HEVCRpiMvField mvf_m = { ++ .xy = {0, 0}, ++ .ref_idx = { ++ zero_idx < nb_refs ? zero_idx : 0, ++ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, ++ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? 
PF_BI : PF_L0 ++ }; ++ ++ *mvf_t = mvf_m; ++ return mvf_t; ++ } ++} ++ ++ ++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, HEVCRpiMvField * const mv) ++{ ++ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? ++ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, ++ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), ++ 0, merge_idx, mv) : ++ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), ++ part_idx, merge_idx, mv); ++ ++ if (mvf_m != mv) ++ *mv = *mvf_m; ++ ++ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) ++ mv->pred_flag = PF_L0; ++} ++ ++ ++static av_always_inline const MvXY * ++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) ++ return mvf->xy + pfi0; ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) ++ return mvf->xy + pfi1; ++ } ++ return NULL; ++} ++ ++static av_always_inline const MvXY * ++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, ++ const int islt0, const int poc0, const int poc_cur, ++ MvXY * const mv_t, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) ++ { ++ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi0; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 
&& rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) ++ { ++ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi1; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ } ++ return NULL; ++} ++ ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX) ++{ ++ const unsigned int pfi0 = LX; ++ const unsigned int pfi1 = LX == 0 ? 1 : 0; ++ const RefPicList * const rpl = s->refPicList; ++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; ++ const int poc_cur = s->poc; ++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const MvXY * mva = NULL; ++ const MvXY * mvb; ++ MvXY * const mv_rv = mv->xy + LX; ++ MvXY mvt_a, mvt_b; ++ ++ *mv_rv = 0; ++ ++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) ++ mvf_a0 = NULL; ++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) ++ goto use_mva; ++ ++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) ++ mvf_a1 = NULL; ++ ++ if (mva == NULL && ++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && ++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) ++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); ++ ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if 
((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) ++ mvf_b0 = NULL; ++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) ++ mvf_b1 = NULL; ++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) ++ mvf_b2 = NULL; ++ ++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && ++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) ++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); ++ ++ if (mvf_a0 == NULL && mvf_a1 == NULL) { ++ mva = mvb; ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && ++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) ++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); ++ } ++ ++ if (mva == NULL) { ++ mva = mvb; ++ mvb = NULL; ++ } ++ ++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B ++ mvb = NULL; ++ ++ if (mvp_lx_flag == 0 && mva != NULL) { ++ goto use_mva; ++ } ++ else if (mvp_lx_flag != 0 && mvb != NULL) { ++ *mv_rv = *mvb; ++ } ++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, ++ nPbH, mv->ref_idx[LX], ++ mv_rv, LX); ++ } ++ return; ++ ++use_mva: ++ *mv_rv = *mva; ++ return; ++} ++ +diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c +new file mode 100644 +index 0000000000..e58a59ce5e +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.c +@@ -0,0 +1,143 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "bytestream.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_parse.h"
++/* Split buf into NAL units and decode any VPS/SPS/PPS into ps and SEI into sei. */
++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    int i;
++    int ret = 0;
++    H2645Packet pkt = { 0 };
++
++    ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
++                                nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
++    if (ret < 0) {
++        goto done;
++    }
++
++    for (i = 0; i < pkt.nb_nals; i++) {
++        H2645NAL *nal = &pkt.nals[i];
++
++        /* decode parameter sets and SEI; any other NAL type is logged and skipped */
++        switch (nal->type) {
++        case HEVC_NAL_VPS:
++            ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_SPS:
++            ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_PPS:
++            ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_SEI_PREFIX:
++        case HEVC_NAL_SEI_SUFFIX:
++            ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
++            if (ret < 0)
++                goto done;
++            break;
++        default:
++            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
++            break;
++        }
++    }
++
++done:
++    ff_h2645_packet_uninit(&pkt); /* reached on success and on every failure path above */
++    if (err_recognition & AV_EF_EXPLODE) /* ret is only passed back when AV_EF_EXPLODE is set */
++        return ret;
++
++    return 0; /* without AV_EF_EXPLODE any failure above is swallowed */
++}
++/* Decode extradata (hvcC record or otherwise) into ps/sei; sets *is_nalff and, for hvcC, *nal_length_size. */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    int ret = 0;
++    GetByteContext gb;
++
++    bytestream2_init(&gb, data, size);
++
++    if (size > 3 && (data[0] || data[1] || data[2] > 1)) {
++        /* It seems the extradata is encoded as hvcC format.
++         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
++         * is finalized. When finalized, configurationVersion will be 1 and we
++         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
++        int i, j, num_arrays, nal_len_size;
++
++        *is_nalff = 1;
++
++        bytestream2_skip(&gb, 21); /* skip the first 21 bytes of the record */
++        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; /* low 2 bits of the next byte, plus 1 */
++        num_arrays = bytestream2_get_byte(&gb);
++
++        /* nal units in the hvcC always have length coded with 2 bytes,
++         * so put a fake nal_length_size = 2 while parsing them */
++        *nal_length_size = 2;
++
++        /* Decode nal units from hvcC. */
++        for (i = 0; i < num_arrays; i++) {
++            int type = bytestream2_get_byte(&gb) & 0x3f; /* low 6 bits; only used in the error message below */
++            int cnt = bytestream2_get_be16(&gb); /* number of NAL units in this array */
++
++            for (j = 0; j < cnt; j++) {
++                // +2 for the nal size field
++                int nalsize = bytestream2_peek_be16(&gb) + 2;
++                if (bytestream2_get_bytes_left(&gb) < nalsize) {
++                    av_log(logctx, AV_LOG_ERROR,
++                           "Invalid NAL unit size in extradata.\n");
++                    return AVERROR_INVALIDDATA;
++                }
++
++                ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
++                                            *nal_length_size, err_recognition, apply_defdispwin,
++                                            logctx);
++                if (ret < 0) {
++                    av_log(logctx, AV_LOG_ERROR,
++                           "Decoding nal unit %d %d from hvcC failed\n",
++                           type, i);
++                    return ret;
++                }
++                bytestream2_skip(&gb, nalsize); /* advance past the size field and the payload */
++            }
++        }
++
++        /* Now store right nal length size, that will be used to parse
++         * all other nals */
++        *nal_length_size = nal_len_size;
++    } else {
++        *is_nalff = 0; /* not hvcC: hand the whole buffer to the NAL splitter; *nal_length_size is left untouched */
++        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
++                                    err_recognition, apply_defdispwin, logctx);
++        if (ret < 0)
++            return ret;
++    }
++
++    return ret;
++}
+diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
+new file mode 100644
+index 0000000000..4b4d032a16
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.h
+@@ -0,0 +1,36 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * H.265 parser code
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PARSE_H
++#define AVCODEC_RPI_HEVC_PARSE_H
++
++#include <stdint.h>
++
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++/* Decode extradata (hvcC record or otherwise) into ps/sei; sets *is_nalff and, for hvcC, *nal_length_size. */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx);
++
++#endif /* AVCODEC_RPI_HEVC_PARSE_H */
+diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
+new file mode 100644
+index 0000000000..f4e31f7d1d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.c
+@@ -0,0 +1,1938 @@
++/*
++ * HEVC Parameter Set decoding
++ *
++ * Copyright (C) 2012 - 2103 Guillaume Martres
++ * Copyright (C) 2012 - 2103 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/imgutils.h" ++#include "golomb.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevcdec.h" ++ ++static const uint8_t default_scaling_list_intra[] = { ++ 16, 16, 16, 16, 17, 18, 21, 24, ++ 16, 16, 16, 16, 17, 19, 22, 25, ++ 16, 16, 17, 18, 20, 22, 25, 29, ++ 16, 16, 18, 21, 24, 27, 31, 36, ++ 17, 17, 20, 24, 30, 35, 41, 47, ++ 18, 19, 22, 27, 35, 44, 54, 65, ++ 21, 22, 25, 31, 41, 54, 70, 88, ++ 24, 25, 29, 36, 47, 65, 88, 115 ++}; ++ ++static const uint8_t default_scaling_list_inter[] = { ++ 16, 16, 16, 16, 17, 18, 20, 24, ++ 16, 16, 16, 17, 18, 20, 24, 25, ++ 16, 16, 17, 18, 20, 24, 25, 28, ++ 16, 17, 18, 20, 24, 25, 28, 33, ++ 17, 18, 20, 24, 25, 28, 33, 41, ++ 18, 20, 24, 25, 28, 33, 41, 54, ++ 20, 24, 25, 28, 33, 41, 54, 71, ++ 24, 25, 28, 33, 41, 54, 71, 91 ++}; ++ ++static const AVRational vui_sar[] = { ++ { 0, 1 }, ++ { 1, 1 }, ++ { 12, 11 }, ++ { 10, 11 }, ++ { 16, 11 }, ++ { 40, 33 }, ++ { 24, 11 }, ++ { 20, 11 }, ++ { 32, 11 }, ++ { 80, 33 }, ++ { 18, 11 }, ++ { 15, 11 }, ++ { 64, 33 }, ++ { 160, 99 }, ++ { 4, 3 }, ++ { 3, 2 }, ++ { 2, 1 }, ++}; ++ ++ ++// pps_cb_qp_offset: -12,+12 ++// slice_cb_qp_offset: -12,+12 also ++// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." ++// cr_qp_offset_list[n]: -12,+12 ++// So worst case total offset: -24,+24 ++ ++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) ++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 
51 : (n)) ++#define M(B,n) C(B,(-n)) ++ ++// Sizeof the QP_START_BLOCK ++#define QP_OFFSET_0 (8*6 + 12*2) ++#define QP_START(B) \ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++\ ++ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ ++ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ ++ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ ++ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ ++ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ ++ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ ++ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ ++ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) ++#define QP_END(B) \ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) ++ ++#define T1(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ ++ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ ++ C(B,44), C(B,45),\ ++ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ ++ QP_END(B)\ ++} ++#define T0(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,30), C(B,31), C(B,32), C(B,33), 
C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ ++ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ ++ C(B,50), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ QP_END(B)\ ++} ++ ++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) ++ ++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; ++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; ++ ++#undef T ++#undef C ++#undef QP_END ++ ++#define C(B,n) ((n)<0?0:(n)>51?51:(n)) ++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps ++#define QP_DBLK_OFFSET_0 QP_OFFSET_0 ++#define QP_END(B)\ ++ 51, 51, 51, 51, 51, 51 ++ ++// These don't need all the padding we have here (12 top/bottom would be enough) ++static const uint8_t qp_c_dblk_0[] = T0(0); ++static const uint8_t qp_c_dblk_1[] = T1(0); ++ ++#undef T ++#undef M ++#undef C ++#undef QP_END ++#undef QP_START ++ ++ ++static void remove_pps(HEVCRpiParamSets * const s, const int id) ++{ ++ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) ++ s->pps = NULL; ++ av_buffer_unref(&s->pps_list[id]); ++} ++ ++static void remove_sps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->sps_list[id]) { ++ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) ++ s->sps = NULL; ++ ++ /* drop all PPS that depend on this SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) ++ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) ++ remove_pps(s, i); ++ ++ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); ++ } ++ av_buffer_unref(&s->sps_list[id]); ++} ++ ++static void remove_vps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->vps_list[id]) { ++ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) ++ s->vps = NULL; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) ++ if 
(s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) ++ remove_sps(s, i); ++ } ++ av_buffer_unref(&s->vps_list[id]); ++} ++ ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, ++ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) ++{ ++ uint8_t rps_predict = 0; ++ int delta_poc; ++ int k0 = 0; ++ int k1 = 0; ++ int k = 0; ++ int i; ++ ++ if (rps != sps->st_rps && sps->nb_st_rps) ++ rps_predict = get_bits1(gb); ++ ++ if (rps_predict) { ++ const ShortTermRPS *rps_ridx; ++ int delta_rps; ++ unsigned abs_delta_rps; ++ uint8_t use_delta_flag = 0; ++ uint8_t delta_rps_sign; ++ ++ if (is_slice_header) { ++ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; ++ if (delta_idx > sps->nb_st_rps) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", ++ delta_idx, sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; ++ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; ++ } else ++ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; ++ ++ delta_rps_sign = get_bits1(gb); ++ abs_delta_rps = get_ue_golomb_long(gb) + 1; ++ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of abs_delta_rps: %d\n", ++ abs_delta_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; ++ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { ++ int used = rps->used[k] = get_bits1(gb); ++ ++ if (!used) ++ use_delta_flag = get_bits1(gb); ++ ++ if (used || use_delta_flag) { ++ if (i < rps_ridx->num_delta_pocs) ++ delta_poc = delta_rps + rps_ridx->delta_poc[i]; ++ else ++ delta_poc = delta_rps; ++ rps->delta_poc[k] = delta_poc; ++ if (delta_poc < 0) ++ k0++; ++ else ++ k1++; ++ k++; ++ } ++ } ++ ++ if (k >= FF_ARRAY_ELEMS(rps->used)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid num_delta_pocs: %d\n", k); ++ return 
AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = k; ++ rps->num_negative_pics = k0; ++ // sort in increasing order (smallest first) ++ if (rps->num_delta_pocs != 0) { ++ int used, tmp; ++ for (i = 1; i < rps->num_delta_pocs; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ for (k = i - 1; k >= 0; k--) { ++ tmp = rps->delta_poc[k]; ++ if (delta_poc < tmp) { ++ rps->delta_poc[k + 1] = tmp; ++ rps->used[k + 1] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ } ++ } ++ } ++ } ++ if ((rps->num_negative_pics >> 1) != 0) { ++ int used; ++ k = rps->num_negative_pics - 1; ++ // flip the negative values to largest first ++ for (i = 0; i < rps->num_negative_pics >> 1; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ rps->delta_poc[i] = rps->delta_poc[k]; ++ rps->used[i] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ k--; ++ } ++ } ++ } else { ++ unsigned int prev, nb_positive_pics; ++ rps->num_negative_pics = get_ue_golomb_long(gb); ++ nb_positive_pics = get_ue_golomb_long(gb); ++ ++ if (rps->num_negative_pics >= HEVC_MAX_REFS || ++ nb_positive_pics >= HEVC_MAX_REFS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; ++ if (rps->num_delta_pocs) { ++ prev = 0; ++ for (i = 0; i < rps->num_negative_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev -= delta_poc; ++ rps->delta_poc[i] = prev; ++ rps->used[i] = get_bits1(gb); ++ } ++ prev = 0; ++ for (i = 0; i < nb_positive_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return 
AVERROR_INVALIDDATA; ++ } ++ prev += delta_poc; ++ rps->delta_poc[rps->num_negative_pics + i] = prev; ++ rps->used[rps->num_negative_pics + i] = get_bits1(gb); ++ } ++ } ++ } ++ return 0; ++} ++ ++ ++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTLCommon * const ptl) ++{ ++ int i; ++ ++ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) ++ return -1; ++ ++ ptl->profile_space = get_bits(gb, 2); ++ ptl->tier_flag = get_bits1(gb); ++ ptl->profile_idc = get_bits(gb, 5); ++ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) ++ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) ++ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) ++ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) ++ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n"); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); ++ ++ for (i = 0; i < 32; i++) { ++ ptl->profile_compatibility_flag[i] = get_bits1(gb); ++ ++ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) ++ ptl->profile_idc = i; ++ } ++ ptl->progressive_source_flag = get_bits1(gb); ++ ptl->interlaced_source_flag = get_bits1(gb); ++ ptl->non_packed_constraint_flag = get_bits1(gb); ++ ptl->frame_only_constraint_flag = get_bits1(gb); ++ ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] ++ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] ++ ++ return 0; ++} ++ ++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTL * const ptl, const int max_num_sub_layers) ++{ ++ int i; ++ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || ++ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { ++ 
av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); ++ return -1; ++ } ++ ++ ptl->general_ptl.level_idc = get_bits(gb, 8); ++ ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); ++ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); ++ } ++ ++ if (max_num_sub_layers - 1> 0) ++ for (i = max_num_sub_layers - 1; i < 8; i++) ++ skip_bits(gb, 2); // reserved_zero_2bits[i] ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ if (ptl->sub_layer_profile_present_flag[i] && ++ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PTL information for sublayer %i too short\n", i); ++ return -1; ++ } ++ if (ptl->sub_layer_level_present_flag[i]) { ++ if (get_bits_left(gb) < 8) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Not enough data for sublayer %i level_idc\n", i); ++ return -1; ++ } else ++ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); ++ } ++ } ++ ++ return 0; ++} ++ ++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, ++ const int subpic_params_present) ++{ ++ int i; ++ ++ for (i = 0; i < nb_cpb; i++) { ++ get_ue_golomb_long(gb); // bit_rate_value_minus1 ++ get_ue_golomb_long(gb); // cpb_size_value_minus1 ++ ++ if (subpic_params_present) { ++ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 ++ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 ++ } ++ skip_bits1(gb); // cbr_flag ++ } ++} ++ ++static int decode_hrd(GetBitContext * const gb, const int common_inf_present, ++ const int max_sublayers) ++{ ++ int nal_params_present = 0, vcl_params_present = 0; ++ int subpic_params_present = 0; ++ int i; ++ ++ if (common_inf_present) { ++ nal_params_present = get_bits1(gb); ++ vcl_params_present = get_bits1(gb); ++ ++ if (nal_params_present || vcl_params_present) { ++ subpic_params_present = get_bits1(gb); ++ ++ if (subpic_params_present) { ++ skip_bits(gb, 8); // tick_divisor_minus2 ++ skip_bits(gb, 5); // 
du_cpb_removal_delay_increment_length_minus1 ++ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag ++ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 ++ } ++ ++ skip_bits(gb, 4); // bit_rate_scale ++ skip_bits(gb, 4); // cpb_size_scale ++ ++ if (subpic_params_present) ++ skip_bits(gb, 4); // cpb_size_du_scale ++ ++ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // dpb_output_delay_length_minus1 ++ } ++ } ++ ++ for (i = 0; i < max_sublayers; i++) { ++ int low_delay = 0; ++ unsigned int nb_cpb = 1; ++ int fixed_rate = get_bits1(gb); ++ ++ if (!fixed_rate) ++ fixed_rate = get_bits1(gb); ++ ++ if (fixed_rate) ++ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 ++ else ++ low_delay = get_bits1(gb); ++ ++ if (!low_delay) { ++ nb_cpb = get_ue_golomb_long(gb) + 1; ++ if (nb_cpb < 1 || nb_cpb > 32) { ++ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (nal_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ if (vcl_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ } ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ int i,j; ++ int vps_id = 0; ++ ptrdiff_t nal_size; ++ HEVCRpiVPS *vps; ++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); ++ ++ if (!vps_buf) ++ return AVERROR(ENOMEM); ++ vps = (HEVCRpiVPS*)vps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(vps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(vps->data)); ++ vps->data_size = sizeof(vps->data); ++ } else { ++ vps->data_size = nal_size; ++ } ++ memcpy(vps->data, gb->buffer, vps->data_size); ++ 
++ vps_id = get_bits(gb, 4); ++ if (vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); ++ goto err; ++ } ++ ++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); ++ goto err; ++ } ++ ++ vps->vps_max_layers = get_bits(gb, 6) + 1; ++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; ++ vps->vps_temporal_id_nesting_flag = get_bits1(gb); ++ ++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); ++ goto err; ++ } ++ ++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", ++ vps->vps_max_sub_layers); ++ goto err; ++ } ++ ++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) ++ goto err; ++ ++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); ++ ++ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; ++ for (; i < vps->vps_max_sub_layers; i++) { ++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); ++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; ++ ++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ vps->vps_max_dec_pic_buffering[i] - 1); ++ goto err; ++ } ++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { ++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", ++ vps->vps_num_reorder_pics[i]); ++ if (avctx->err_recognition & AV_EF_EXPLODE) ++ goto err; ++ } ++ } ++ ++ vps->vps_max_layer_id = get_bits(gb, 6); ++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; ++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || ++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { ++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); ++ goto err; ++ } ++ ++ for (i = 1; i < vps->vps_num_layer_sets; i++) ++ for (j = 0; j <= vps->vps_max_layer_id; j++) ++ skip_bits(gb, 1); // layer_id_included_flag[i][j] ++ ++ vps->vps_timing_info_present_flag = get_bits1(gb); ++ if (vps->vps_timing_info_present_flag) { ++ vps->vps_num_units_in_tick = get_bits_long(gb, 32); ++ vps->vps_time_scale = get_bits_long(gb, 32); ++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vps->vps_poc_proportional_to_timing_flag) ++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); ++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { ++ av_log(avctx, AV_LOG_ERROR, ++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); ++ goto err; ++ } ++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { ++ int 
common_inf_present = 1; ++ ++ get_ue_golomb_long(gb); // hrd_layer_set_idx ++ if (i) ++ common_inf_present = get_bits1(gb); ++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); ++ } ++ } ++ get_bits1(gb); /* vps_extension_flag */ ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread VPS by %d bits\n", -get_bits_left(gb)); ++ if (ps->vps_list[vps_id]) ++ goto err; ++ } ++ ++ if (ps->vps_list[vps_id] && ++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { ++ av_buffer_unref(&vps_buf); ++ } else { ++ remove_vps(ps, vps_id); ++ ps->vps_list[vps_id] = vps_buf; ++ } ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&vps_buf); ++ return AVERROR_INVALIDDATA; ++} ++ ++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, ++ const int apply_defdispwin, HEVCRpiSPS * const sps) ++{ ++ VUI backup_vui, * const vui = &sps->vui; ++ GetBitContext backup; ++ int sar_present, alt = 0; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); ++ ++ sar_present = get_bits1(gb); ++ if (sar_present) { ++ uint8_t sar_idx = get_bits(gb, 8); ++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) ++ vui->sar = vui_sar[sar_idx]; ++ else if (sar_idx == 255) { ++ vui->sar.num = get_bits(gb, 16); ++ vui->sar.den = get_bits(gb, 16); ++ } else ++ av_log(avctx, AV_LOG_WARNING, ++ "Unknown SAR index: %u.\n", sar_idx); ++ } ++ ++ vui->overscan_info_present_flag = get_bits1(gb); ++ if (vui->overscan_info_present_flag) ++ vui->overscan_appropriate_flag = get_bits1(gb); ++ ++ vui->video_signal_type_present_flag = get_bits1(gb); ++ if (vui->video_signal_type_present_flag) { ++ vui->video_format = get_bits(gb, 3); ++ vui->video_full_range_flag = get_bits1(gb); ++ vui->colour_description_present_flag = get_bits1(gb); ++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) ++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; ++ if (vui->colour_description_present_flag) { ++ vui->colour_primaries = get_bits(gb, 8); ++ vui->transfer_characteristic = 
get_bits(gb, 8); ++ vui->matrix_coeffs = get_bits(gb, 8); ++ ++ // Set invalid values to "unspecified" ++ if (!av_color_primaries_name(vui->colour_primaries)) ++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; ++ if (!av_color_transfer_name(vui->transfer_characteristic)) ++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; ++ if (!av_color_space_name(vui->matrix_coeffs)) ++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; ++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV444P: ++ sps->pix_fmt = AV_PIX_FMT_GBRP; ++ break; ++ case AV_PIX_FMT_YUV444P10: ++ sps->pix_fmt = AV_PIX_FMT_GBRP10; ++ break; ++ case AV_PIX_FMT_YUV444P12: ++ sps->pix_fmt = AV_PIX_FMT_GBRP12; ++ break; ++ } ++ } ++ } ++ } ++ ++ vui->chroma_loc_info_present_flag = get_bits1(gb); ++ if (vui->chroma_loc_info_present_flag) { ++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); ++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); ++ } ++ ++ vui->neutra_chroma_indication_flag = get_bits1(gb); ++ vui->field_seq_flag = get_bits1(gb); ++ vui->frame_field_info_present_flag = get_bits1(gb); ++ ++ // Backup context in case an alternate header is detected ++ memcpy(&backup, gb, sizeof(backup)); ++ memcpy(&backup_vui, vui, sizeof(backup_vui)); ++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { ++ vui->default_display_window_flag = 0; ++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); ++ } else ++ vui->default_display_window_flag = get_bits1(gb); ++ ++ if (vui->default_display_window_flag) { ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if 
(apply_defdispwin && ++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding vui default display window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ vui->def_disp_win.left_offset, ++ vui->def_disp_win.right_offset, ++ vui->def_disp_win.top_offset, ++ vui->def_disp_win.bottom_offset); ++ ++ vui->def_disp_win.left_offset = ++ vui->def_disp_win.right_offset = ++ vui->def_disp_win.top_offset = ++ vui->def_disp_win.bottom_offset = 0; ++ } ++ } ++ ++timing_info: ++ vui->vui_timing_info_present_flag = get_bits1(gb); ++ ++ if (vui->vui_timing_info_present_flag) { ++ if( get_bits_left(gb) < 66 && !alt) { ++ // The alternate syntax seem to have timing info located ++ // at where def_disp_win is normally located ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI timing information, retrying...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->vui_num_units_in_tick = get_bits_long(gb, 32); ++ vui->vui_time_scale = get_bits_long(gb, 32); ++ if (alt) { ++ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n", ++ vui->vui_time_scale, vui->vui_num_units_in_tick); ++ } ++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vui->vui_poc_proportional_to_timing_flag) ++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); ++ vui->vui_hrd_parameters_present_flag = get_bits1(gb); ++ if (vui->vui_hrd_parameters_present_flag) ++ decode_hrd(gb, 1, sps->max_sub_layers); ++ } ++ ++ vui->bitstream_restriction_flag = get_bits1(gb); ++ if (vui->bitstream_restriction_flag) { ++ if (get_bits_left(gb) < 8 && !alt) { ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI bitstream restriction information, retrying" ++ " from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->tiles_fixed_structure_flag = get_bits1(gb); ++ 
vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); ++ vui->restricted_ref_pic_lists_flag = get_bits1(gb); ++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); ++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); ++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); ++ } ++ ++ if (get_bits_left(gb) < 1 && !alt) { ++ // XXX: Alternate syntax when sps_range_extension_flag != 0? ++ av_log(avctx, AV_LOG_WARNING, ++ "Overread in VUI, retrying from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++} ++ ++static void set_default_scaling_list_data(ScalingList * const sl) ++{ ++ int matrixId; ++ ++ for (matrixId = 0; matrixId < 6; matrixId++) { ++ // 4x4 default is 16 ++ memset(sl->sl[0][matrixId], 16, 16); ++ sl->sl_dc[0][matrixId] = 16; // default for 16x16 ++ sl->sl_dc[1][matrixId] = 16; // default for 32x32 ++ } ++ ++ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[3][3], 
default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); ++} ++ ++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, ++ const HEVCRpiSPS * const sps) ++{ ++ uint8_t scaling_list_pred_mode_flag; ++ int32_t scaling_list_dc_coef[2][6]; ++ int size_id, matrix_id, pos; ++ int i; ++ ++ for (size_id = 0; size_id < 4; size_id++) ++ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) { ++ scaling_list_pred_mode_flag = get_bits1(gb); ++ if (!scaling_list_pred_mode_flag) { ++ unsigned int delta = get_ue_golomb_long(gb); ++ /* Only need to handle non-zero delta. Zero means default, ++ * which should already be in the arrays. */ ++ if (delta) { ++ // Copy from previous array. ++ delta *= (size_id == 3) ? 3 : 1; ++ if (matrix_id < delta) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid delta in scaling list data: %d.\n", delta); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ memcpy(sl->sl[size_id][matrix_id], ++ sl->sl[size_id][matrix_id - delta], ++ size_id > 0 ? 
64 : 16); ++ if (size_id > 1) ++ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; ++ } ++ } else { ++ int next_coef, coef_num; ++ int32_t scaling_list_delta_coef; ++ ++ next_coef = 8; ++ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); ++ if (size_id > 1) { ++ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; ++ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; ++ sl->sl_dc[size_id - 2][matrix_id] = next_coef; ++ } ++ for (i = 0; i < coef_num; i++) { ++ if (size_id == 0) ++ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + ++ ff_hevc_rpi_diag_scan4x4_x[i]; ++ else ++ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + ++ ff_hevc_rpi_diag_scan8x8_x[i]; ++ ++ scaling_list_delta_coef = get_se_golomb(gb); ++ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; ++ sl->sl[size_id][matrix_id][pos] = next_coef; ++ } ++ } ++ } ++ ++ if (sps->chroma_format_idc == 3) { ++ for (i = 0; i < 64; i++) { ++ sl->sl[3][1][i] = sl->sl[2][1][i]; ++ sl->sl[3][2][i] = sl->sl[2][2][i]; ++ sl->sl[3][4][i] = sl->sl[2][4][i]; ++ sl->sl[3][5][i] = sl->sl[2][5][i]; ++ } ++ sl->sl_dc[1][1] = sl->sl_dc[0][1]; ++ sl->sl_dc[1][2] = sl->sl_dc[0][2]; ++ sl->sl_dc[1][4] = sl->sl_dc[0][4]; ++ sl->sl_dc[1][5] = sl->sl_dc[0][5]; ++ } ++ ++ ++ return 0; ++} ++ ++static int map_pixel_format(HEVCRpiSPS * const sps) ++{ ++ const int cfmt = sps->chroma_format_idc; ++ ++ sps->pix_fmt = AV_PIX_FMT_NONE; ++ switch (sps->bit_depth) { ++ case 8: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND128; ++ break; ++ case 10: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND64_10; ++ break; ++ default: ++ break; ++ } ++ ++ sps->hshift[0] = sps->vshift[0] = 0; ++ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 ++ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 ++ ++ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; ++ ++ return 0; ++} ++ ++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, ++ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) ++{ ++ HEVCRpiWindow *ow; ++ int ret = 0; ++ int log2_diff_max_min_transform_block_size; ++ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; ++ int i; ++ ++ // Coded parameters ++ ++ sps->vps_id = get_bits(gb, 4); ++ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (vps_list && !vps_list[sps->vps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", ++ sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->max_sub_layers = get_bits(gb, 3) + 1; ++ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", ++ sps->max_sub_layers); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->temporal_id_nesting_flag = get_bits(gb, 1); ++ ++ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) ++ return ret; ++ ++ *sps_id = get_ue_golomb_long(gb); ++ if (*sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->chroma_format_idc = get_ue_golomb_long(gb); ++ if (sps->chroma_format_idc > 3U) { ++ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->chroma_format_idc == 3) ++ sps->separate_colour_plane_flag = get_bits1(gb); ++ ++ if (sps->separate_colour_plane_flag) ++ sps->chroma_format_idc = 0; ++ ++ sps->width = get_ue_golomb_long(gb); ++ sps->height = get_ue_golomb_long(gb); ++ if ((ret = av_image_check_size(sps->width, ++ sps->height, 0, avctx)) < 0) ++ return ret; ++ ++ if (get_bits1(gb)) { // pic_conformance_flag ++ int vert_mult = 1 + 
(sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding sps conformance window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ sps->pic_conf_win.left_offset, ++ sps->pic_conf_win.right_offset, ++ sps->pic_conf_win.top_offset, ++ sps->pic_conf_win.bottom_offset); ++ ++ sps->pic_conf_win.left_offset = ++ sps->pic_conf_win.right_offset = ++ sps->pic_conf_win.top_offset = ++ sps->pic_conf_win.bottom_offset = 0; ++ } ++ sps->output_window = sps->pic_conf_win; ++ } ++ ++ sps->bit_depth = get_ue_golomb_long(gb) + 8; ++ bit_depth_chroma = get_ue_golomb_long(gb) + 8; ++ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Luma bit depth (%d) is different from chroma bit depth (%d), " ++ "this is unsupported.\n", ++ sps->bit_depth, bit_depth_chroma); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ret = map_pixel_format(sps); ++ if (ret < 0) ++ return ret; ++ ++ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; ++ if (sps->log2_max_poc_lsb > 16) { ++ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", ++ sps->log2_max_poc_lsb - 4); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sublayer_ordering_info = get_bits1(gb); ++ start = sublayer_ordering_info ? 
0 : sps->max_sub_layers - 1; ++ for (i = start; i < sps->max_sub_layers; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; ++ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); ++ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; ++ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ sps->temporal_layer[i].max_dec_pic_buffering - 1U); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { ++ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", ++ sps->temporal_layer[i].num_reorder_pics); ++ if (avctx->err_recognition & AV_EF_EXPLODE || ++ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { ++ return AVERROR_INVALIDDATA; ++ } ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; ++ } ++ } ++ ++ if (!sublayer_ordering_info) { ++ for (i = 0; i < start; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; ++ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; ++ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; ++ } ++ } ++ ++ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); ++ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; ++ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); ++ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + ++ sps->log2_min_tb_size; ++ ++ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if 
(sps->log2_diff_max_min_coding_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ { ++ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; ++ // Not a bitstream limitation, but all profiles ++ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Inferred parameters ++ sps->log2_ctb_size = CtbLog2SizeY; ++// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; ++ } ++ ++ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); ++ sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); ++ ++ sps->scaling_list_enable_flag = get_bits1(gb); ++ if (sps->scaling_list_enable_flag) { ++ set_default_scaling_list_data(&sps->scaling_list); ++ ++ if (get_bits1(gb)) { ++ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); ++ if (ret < 0) ++ return ret; ++ } ++ } ++ ++ sps->amp_enabled_flag = get_bits1(gb); ++ sps->sao_enabled = get_bits1(gb); ++ ++ // Set pcm defaults (0) so we don't have to test _enabled when we ++ // want to use them ++ memset(&sps->pcm, 0, 
sizeof(sps->pcm)); ++ ++ if (get_bits1(gb)) // pcm_enabled_flag ++ { ++ const unsigned int limit_max_pcm = FFMIN(5, ++ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); ++ sps->pcm.bit_depth = get_bits(gb, 4) + 1; ++ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; ++ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + ++ get_ue_golomb_long(gb); ++ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", ++ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || ++ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { ++ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", ++ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->pcm.loop_filter_disable_flag = get_bits1(gb); ++ } ++ ++ // Could be based on min_pcm_cb_size but much easier logic if we just stick ++ // with 8 (and costs us little) ++ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up ++ sps->pcm_height = (sps->height + 7) >> 3; ++ ++ sps->nb_st_rps = get_ue_golomb_long(gb); ++ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", ++ sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->nb_st_rps; i++) { ++ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], ++ sps, 0)) < 0) ++ return ret; ++ } ++ ++ sps->long_term_ref_pics_present_flag = get_bits1(gb); ++ if (sps->long_term_ref_pics_present_flag) { ++ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); ++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { ++ av_log(avctx, AV_LOG_ERROR, 
"num_long_term_ref_pics_sps %d is out of range.\n", ++ sps->num_long_term_ref_pics_sps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { ++ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); ++ } ++ } ++ ++ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); ++ sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag ++ sps->vui.sar = (AVRational){0, 1}; ++ vui_present = get_bits1(gb); ++ if (vui_present) ++ decode_vui(gb, avctx, apply_defdispwin, sps); ++ ++ if (get_bits1(gb)) { // sps_extension_flag ++ int sps_extension_flag[1]; ++ for (i = 0; i < 1; i++) ++ sps_extension_flag[i] = get_bits1(gb); ++ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); ++ if (sps_extension_flag[0]) { ++ int extended_precision_processing_flag; ++ int cabac_bypass_alignment_enabled_flag; ++ ++ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); ++ sps->transform_skip_context_enabled_flag = get_bits1(gb); ++ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ extended_precision_processing_flag = get_bits1(gb); ++ if (extended_precision_processing_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "extended_precision_processing_flag not yet implemented\n"); ++ ++ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag ++ sps->intra_filters_disable |= FILTER_EITHER; ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); ++ ++ cabac_bypass_alignment_enabled_flag = get_bits1(gb); ++ if (cabac_bypass_alignment_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); ++ } ++ } ++ if (apply_defdispwin) { ++ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; ++ sps->output_window.right_offset += 
sps->vui.def_disp_win.right_offset; ++ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; ++ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; ++ } ++ ++ ow = &sps->output_window; ++ if (ow->left_offset >= INT_MAX - ow->right_offset || ++ ow->top_offset >= INT_MAX - ow->bottom_offset || ++ ow->left_offset + ow->right_offset >= sps->width || ++ ow->top_offset + ow->bottom_offset >= sps->height) { ++ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", ++ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); ++ if (avctx->err_recognition & AV_EF_EXPLODE) { ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, ++ "Displaying the whole video surface.\n"); ++ memset(ow, 0, sizeof(*ow)); ++ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); ++ } ++ ++ // Inferred parameters ++ ++ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_size = sps->ctb_width * sps->ctb_height; ++ ++ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; ++ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; ++ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; ++ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; ++ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; ++ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; ++ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; ++ ++ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); ++ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); ++ ++ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || ++ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", ++ sps->max_transform_hierarchy_depth_inter); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", ++ sps->max_transform_hierarchy_depth_intra); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "max transform block size out of range: %d\n", ++ sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread SPS by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin) ++{ ++ HEVCRpiSPS *sps; ++ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); ++ unsigned int sps_id; ++ int ret; ++ ptrdiff_t nal_size; ++ ++ if (!sps_buf) ++ return AVERROR(ENOMEM); ++ sps = (HEVCRpiSPS*)sps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(sps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(sps->data)); ++ sps->data_size = sizeof(sps->data); ++ } else { ++ sps->data_size = nal_size; ++ } ++ memcpy(sps->data, gb->buffer, sps->data_size); ++ ++ ret = ff_hevc_rpi_parse_sps(sps, gb, 
&sps_id, ++ apply_defdispwin, ++ ps->vps_list, avctx); ++ if (ret < 0) { ++ av_buffer_unref(&sps_buf); ++ return ret; ++ } ++ ++ if (avctx->debug & FF_DEBUG_BITSTREAM) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "Parsed SPS: id %d; coded wxh: %dx%d; " ++ "cropped wxh: %dx%d; pix_fmt: %s.\n", ++ sps_id, sps->width, sps->height, ++ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), ++ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), ++ av_get_pix_fmt_name(sps->pix_fmt)); ++ } ++ ++ /* check if this is a repeat of an already parsed SPS, then keep the ++ * original one. ++ * otherwise drop all PPSes that depend on it */ ++ if (ps->sps_list[sps_id] && ++ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { ++ av_buffer_unref(&sps_buf); ++ } else { ++ remove_sps(ps, sps_id); ++ ps->sps_list[sps_id] = sps_buf; ++ } ++ ++ return 0; ++} ++ ++static void hevc_pps_free(void *opaque, uint8_t *data) ++{ ++ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; ++ ++ av_freep(&pps->column_width); ++ av_freep(&pps->row_height); ++ av_freep(&pps->col_bd); ++ av_freep(&pps->row_bd); ++ av_freep(&pps->col_idxX); ++ av_freep(&pps->ctb_addr_rs_to_ts); ++ av_freep(&pps->ctb_addr_ts_to_rs); ++ av_freep(&pps->tile_pos_ts); ++ av_freep(&pps->tile_size); ++ av_freep(&pps->tile_id); ++ av_freep(&pps->ctb_ts_flags); ++ ++ av_freep(&pps); ++} ++ ++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) ++{ ++ do ++ { ++ const int offset = get_se_golomb_long(gb); ++ if (offset < -12 || offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); ++ return AVERROR_INVALIDDATA; ++ } ++ *offsets++ = offset; ++ } while (n_minus_1-- != 0); ++ return 0; ++} ++ ++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ if 
(pps->transform_skip_enabled_flag) { ++ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; ++ } ++ pps->cross_component_prediction_enabled_flag = get_bits1(gb); ++ if (pps->cross_component_prediction_enabled_flag && ++ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) ++ { ++ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); ++ if (pps->chroma_qp_offset_list_enabled_flag) { ++ int err; ++ ++ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); ++ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); ++ if (pps->chroma_qp_offset_list_len_minus1 > 5) { ++ av_log(avctx, AV_LOG_ERROR, ++ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); ++ ++ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || ++ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) ++ return err; ++ } ++ ++ { ++ const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; ++ ++ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_luma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_chroma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ return(0); ++} ++ ++static inline int setup_pps(AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ int pic_area_in_ctbs; ++ int i, j, x, y, ctb_addr_rs, tile_id; ++ ++ // Inferred parameters ++ ++ // qp_y -> qp_u/qp_v tables ++ // The tables have at least -24,+24 overrun after adding offset here ++ // which should allow for clipless offseting ++ ++ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code ++ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; ++ ++ if (sps->chroma_format_idc == 1) { ++ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ else ++ { ++ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ ++ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); ++ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); ++ pps->col_idxX = av_malloc_array(sps->ctb_width, 
sizeof(*pps->col_idxX)); ++ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) ++ return AVERROR(ENOMEM); ++ ++ if (pps->uniform_spacing_flag) { ++ if (!pps->column_width) { ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ } ++ if (!pps->column_width || !pps->row_height) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - ++ (i * sps->ctb_width) / pps->num_tile_columns; ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - ++ (i * sps->ctb_height) / pps->num_tile_rows; ++ } ++ } ++ ++ { ++ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); ++ pps->col_bd[0] = 0; ++ pps->tile_wpp_inter_disable = 0; ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; ++ ++ // Avoid trying tile parallel if the columns don't fall on cache boundries ++ // (this causes too much pain syncing flushes with the QPU) ++ // Ignore the final (RHS of pic) tile boundry ++ if ((pps->col_bd[i] & td_mask) != 0) { ++ pps->tile_wpp_inter_disable = 1; ++ } ++ } ++ ++ // If we can start the next row before finishing the first line of ++ // this one then we must wait at the end of the tile ++ // * if this happens a lot then there are better but more complicated ++ // conditions that we could apply ++ if (pps->tile_wpp_inter_disable) { ++ for (i = 0; i < pps->num_tile_rows; i++) ++ { ++ if (pps->row_height[i] <= RPI_MAX_JOBS) { ++ pps->tile_wpp_inter_disable = 2; ++ break; ++ } ++ } ++ } ++ } ++ ++ pps->row_bd[0] = 0; ++ for (i = 0; i < pps->num_tile_rows; i++) ++ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; ++ ++ for (i = 0, j = 0; i < sps->ctb_width; i++) { ++ if (i >= pps->col_bd[j + 1]) ++ j++; ++ 
pps->col_idxX[i] = j; ++ } ++ ++ /** ++ * 6.5 ++ */ ++ pic_area_in_ctbs = sps->ctb_size; ++ ++ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); ++ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); ++ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); ++ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); ++ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); ++ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); ++ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || ++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { ++ return AVERROR(ENOMEM); ++ } ++ ++ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); ++ ++ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { ++ int tb_x = ctb_addr_rs % sps->ctb_width; ++ int tb_y = ctb_addr_rs / sps->ctb_width; ++ int tile_x = 0; ++ int tile_y = 0; ++ int val = 0; ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ if (tb_x < pps->col_bd[i + 1]) { ++ tile_x = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ if (tb_y < pps->row_bd[i + 1]) { ++ tile_y = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < tile_x; i++) ++ val += pps->row_height[tile_y] * pps->column_width[i]; ++ for (i = 0; i < tile_y; i++) ++ val += sps->ctb_width * pps->row_height[i]; ++ ++ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + ++ tb_x - pps->col_bd[tile_x]; ++ ++ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; ++ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; ++ } ++ ++ { ++ uint8_t * pflags = pps->ctb_ts_flags; ++ uint16_t * ptid = pps->tile_id; ++ ++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) ++ { ++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) ++ { ++ const unsigned int tile_w = 
pps->column_width[i]; ++ ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ for (x = 0; x != tile_w; ++x) { ++ pflags[x] |= CTB_TS_FLAGS_TOT; ++ } ++ ++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) ++ { ++ pflags[0] |= CTB_TS_FLAGS_SOTL; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ { ++ if (pps->column_width[i] != 1) ++ pflags[1] |= CTB_TS_FLAGS_CSAVE; ++ else ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) ++ pflags[0] |= CTB_TS_FLAGS_CLOAD; ++ } ++ ++ for (x = 0; x != tile_w; ++x) ++ *ptid++ = tile_id; ++ ++ pflags += tile_w; ++ pflags[-1] |= CTB_TS_FLAGS_EOTL; ++ if (i + 1 == pps->num_tile_columns) ++ pflags[-1] |= CTB_TS_FLAGS_EOL; ++ } ++ ++ pflags[-1] |= CTB_TS_FLAGS_EOT; ++ } ++ } ++ } ++ ++ { ++ unsigned int ts = 0; ++ for (j = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ const unsigned int size = pps->column_width[i] * pps->row_height[j]; ++ pps->tile_size[j * pps->num_tile_columns + i] = size; ++ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; ++ ts += size; ++ } ++ } ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ const HEVCRpiSPS *sps = NULL; ++ int i, ret = 0; ++ unsigned int pps_id = 0; ++ ptrdiff_t nal_size; ++ unsigned log2_parallel_merge_level_minus2; ++ ++ AVBufferRef *pps_buf; ++ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); ++ ++ if (!pps) ++ return AVERROR(ENOMEM); ++ ++ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), ++ hevc_pps_free, NULL, 0); ++ if (!pps_buf) { ++ av_freep(&pps); ++ return AVERROR(ENOMEM); ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(pps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(pps->data)); ++ pps->data_size = sizeof(pps->data); ++ } else 
{ ++ pps->data_size = nal_size; ++ } ++ memcpy(pps->data, gb->buffer, pps->data_size); ++ ++ // Default values ++ pps->loop_filter_across_tiles_enabled_flag = 1; ++ pps->num_tile_columns = 1; ++ pps->num_tile_rows = 1; ++ pps->uniform_spacing_flag = 1; ++ pps->disable_dbf = 0; ++ pps->beta_offset = 0; ++ pps->tc_offset = 0; ++ pps->log2_max_transform_skip_block_size = 2; ++ ++ // Coded parameters ++ pps_id = get_ue_golomb_long(gb); ++ if (pps_id >= HEVC_MAX_PPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->sps_id = get_ue_golomb_long(gb); ++ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (!ps->sps_list[pps->sps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; ++ ++ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); ++ pps->output_flag_present_flag = get_bits1(gb); ++ pps->num_extra_slice_header_bits = get_bits(gb, 3); ++ ++ pps->sign_data_hiding_flag = get_bits1(gb); ++ ++ pps->cabac_init_present_flag = get_bits1(gb); ++ ++ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->pic_init_qp_minus26 = get_se_golomb(gb); ++ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + 
sps->qp_bd_offset)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "init_qp_minus26 %d is outside the valid range " ++ "[%d, %d].\n", ++ pps->pic_init_qp_minus26, ++ -(26 + sps->qp_bd_offset), 25); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->constrained_intra_pred_flag = get_bits1(gb); ++ pps->transform_skip_enabled_flag = get_bits1(gb); ++ ++ pps->cu_qp_delta_enabled_flag = get_bits1(gb); ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; ++ if (pps->cu_qp_delta_enabled_flag) ++ { ++ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); ++ ++ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { ++ av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n", ++ diff_cu_qp_delta_depth); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; ++ } ++ ++ pps->cb_qp_offset = get_se_golomb(gb); ++ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", ++ pps->cb_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->cr_qp_offset = get_se_golomb(gb); ++ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", ++ pps->cr_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); ++ ++ pps->weighted_pred_flag = get_bits1(gb); ++ pps->weighted_bipred_flag = get_bits1(gb); ++ ++ pps->transquant_bypass_enable_flag = get_bits1(gb); ++ pps->tiles_enabled_flag = get_bits1(gb); ++ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); ++ ++ if (pps->tiles_enabled_flag) { ++ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; ++ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; ++ if (pps->num_tile_columns <= 0 || ++ pps->num_tile_columns >= sps->width) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", 
++ pps->num_tile_columns - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (pps->num_tile_rows <= 0 || ++ pps->num_tile_rows >= sps->height) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", ++ pps->num_tile_rows - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ if (!pps->column_width || !pps->row_height) { ++ ret = AVERROR(ENOMEM); ++ goto err; ++ } ++ ++ pps->uniform_spacing_flag = get_bits1(gb); ++ if (!pps->uniform_spacing_flag) { ++ uint64_t sum = 0; ++ for (i = 0; i < pps->num_tile_columns - 1; i++) { ++ pps->column_width[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->column_width[i]; ++ } ++ if (sum >= sps->ctb_width) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; ++ ++ sum = 0; ++ for (i = 0; i < pps->num_tile_rows - 1; i++) { ++ pps->row_height[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->row_height[i]; ++ } ++ if (sum >= sps->ctb_height) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; ++ } ++ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); ++ } ++ ++ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ ++ pps->deblocking_filter_control_present_flag = get_bits1(gb); ++ if (pps->deblocking_filter_control_present_flag) { ++ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); ++ pps->disable_dbf = get_bits1(gb); ++ if (!pps->disable_dbf) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of 
range: %d\n", ++ beta_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", ++ tc_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->beta_offset = 2 * beta_offset_div2; ++ pps->tc_offset = 2 * tc_offset_div2; ++ } ++ } ++ ++ pps->scaling_list_data_present_flag = get_bits1(gb); ++ if (pps->scaling_list_data_present_flag) { ++ set_default_scaling_list_data(&pps->scaling_list); ++ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); ++ if (ret < 0) ++ goto err; ++ } ++ pps->lists_modification_present_flag = get_bits1(gb); ++ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); ++ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { ++ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", ++ log2_parallel_merge_level_minus2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; ++ ++ pps->slice_header_extension_present_flag = get_bits1(gb); ++ ++ if (get_bits1(gb)) { // pps_extension_present_flag ++ int pps_range_extensions_flag = get_bits1(gb); ++ skip_bits(gb, 7); // pps_extension_7bits ++ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { ++ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) ++ goto err; ++ } ++ } ++ ++ ret = setup_pps(avctx, pps, sps); ++ if (ret < 0) ++ goto err; ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread PPS by %d bits\n", -get_bits_left(gb)); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ remove_pps(ps, pps_id); ++ ps->pps_list[pps_id] = pps_buf; ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&pps_buf); ++ return ret; ++} ++ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) ++{ ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int 
prev_poc_lsb = pocTid0 % max_poc_lsb; ++ int prev_poc_msb = pocTid0 - prev_poc_lsb; ++ int poc_msb; ++ ++ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) ++ poc_msb = prev_poc_msb + max_poc_lsb; ++ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) ++ poc_msb = prev_poc_msb - max_poc_lsb; ++ else ++ poc_msb = prev_poc_msb; ++ ++ // For BLA picture types, POCmsb is set to 0. ++ if (nal_unit_type == HEVC_NAL_BLA_W_LP || ++ nal_unit_type == HEVC_NAL_BLA_W_RADL || ++ nal_unit_type == HEVC_NAL_BLA_N_LP) ++ poc_msb = 0; ++ ++ return poc_msb + poc_lsb; ++} +diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h +new file mode 100644 +index 0000000000..c725ebb9ca +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.h +@@ -0,0 +1,449 @@ ++/* ++ * HEVC parameter set parsing ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_PS_H ++#define AVCODEC_RPI_HEVC_PS_H ++ ++#include ++ ++#include "libavutil/buffer.h" ++#include "libavutil/pixfmt.h" ++#include "libavutil/rational.h" ++ ++#include "avcodec.h" ++#include "get_bits.h" ++#include "hevc.h" ++ ++typedef struct ShortTermRPS { ++ unsigned int num_negative_pics; ++ int num_delta_pocs; ++ int rps_idx_num_delta_pocs; ++ int32_t delta_poc[32]; ++ uint8_t used[32]; ++} ShortTermRPS; ++ ++typedef struct LongTermRPS { ++ int poc[32]; ++ uint8_t used[32]; ++ uint8_t nb_refs; ++} LongTermRPS; ++ ++typedef struct RpiSliceHeader { ++ unsigned int pps_id; ++ ++ ///< address (in raster order) of the first block in the current slice segment ++ unsigned int slice_segment_addr; ++ ///< address (in raster order) of the first block in the current slice ++ unsigned int slice_addr; ++ ++ enum HEVCSliceType slice_type; ++ ++ int pic_order_cnt_lsb; ++ ++ uint8_t first_slice_in_pic_flag; ++ uint8_t dependent_slice_segment_flag; ++ uint8_t pic_output_flag; ++ uint8_t colour_plane_id; ++ ++ ///< RPS coded in the slice header itself is stored here ++ int short_term_ref_pic_set_sps_flag; ++ int short_term_ref_pic_set_size; ++ ShortTermRPS slice_rps; ++ const ShortTermRPS *short_term_rps; ++ int long_term_ref_pic_set_size; ++ LongTermRPS long_term_rps; ++ unsigned int list_entry_lx[2][32]; ++ ++ uint8_t rpl_modification_flag[2]; ++ uint8_t no_output_of_prior_pics_flag; ++ uint8_t slice_temporal_mvp_enabled_flag; ++ ++ unsigned int nb_refs[2]; ++ ++ uint8_t slice_sample_adaptive_offset_flag[3]; ++ uint8_t mvd_l1_zero_flag; ++ ++ uint8_t cabac_init_flag; ++ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag ++ uint8_t slice_loop_filter_across_slices_enabled_flag; ++ 
uint8_t collocated_list; ++ ++ uint8_t no_dblk_boundary_flags; ++ ++ unsigned int collocated_ref_idx; ++ ++ int slice_qp_delta; ++ int slice_cb_qp_offset; // -12, +12 ++ int slice_cr_qp_offset; // -12, +12 ++ ++ uint8_t cu_chroma_qp_offset_enabled_flag; ++ ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand ++ ++ unsigned *entry_point_offset; ++ int * offset; ++ int * size; ++ int num_entry_point_offsets; ++ int offsets_allocated; ++ ++ uint8_t offload_wpp; ++ uint8_t offload_tiles; ++ ++ int8_t slice_qp; ++ ++ uint8_t luma_log2_weight_denom; ++ uint8_t chroma_log2_weight_denom; ++ ++ int16_t luma_weight_l0[16]; // -128, +255 ++ int16_t luma_offset_l0[16]; ++ int16_t chroma_weight_l0[16][2]; ++ int16_t chroma_offset_l0[16][2]; ++ ++ int16_t luma_weight_l1[16]; ++ int16_t luma_offset_l1[16]; ++ int16_t chroma_weight_l1[16][2]; ++ int16_t chroma_offset_l1[16][2]; ++ ++} RpiSliceHeader; ++ ++typedef struct HEVCRpiWindow { ++ uint16_t left_offset; ++ uint16_t right_offset; ++ uint16_t top_offset; ++ uint16_t bottom_offset; ++} HEVCRpiWindow; ++ ++typedef struct VUI { ++ AVRational sar; ++ ++ int overscan_info_present_flag; ++ int overscan_appropriate_flag; ++ ++ int video_signal_type_present_flag; ++ int video_format; ++ int video_full_range_flag; ++ int colour_description_present_flag; ++ uint8_t colour_primaries; ++ uint8_t transfer_characteristic; ++ uint8_t matrix_coeffs; ++ ++ int chroma_loc_info_present_flag; ++ int chroma_sample_loc_type_top_field; ++ int chroma_sample_loc_type_bottom_field; ++ int neutra_chroma_indication_flag; ++ ++ int field_seq_flag; ++ int frame_field_info_present_flag; ++ ++ int default_display_window_flag; ++ HEVCRpiWindow def_disp_win; ++ ++ int vui_timing_info_present_flag; ++ uint32_t vui_num_units_in_tick; ++ uint32_t vui_time_scale; ++ int vui_poc_proportional_to_timing_flag; ++ int vui_num_ticks_poc_diff_one_minus1; ++ int 
vui_hrd_parameters_present_flag; ++ ++ int bitstream_restriction_flag; ++ int tiles_fixed_structure_flag; ++ int motion_vectors_over_pic_boundaries_flag; ++ int restricted_ref_pic_lists_flag; ++ int min_spatial_segmentation_idc; ++ int max_bytes_per_pic_denom; ++ int max_bits_per_min_cu_denom; ++ int log2_max_mv_length_horizontal; ++ int log2_max_mv_length_vertical; ++} VUI; ++ ++typedef struct PTLCommon { ++ uint8_t profile_space; ++ uint8_t tier_flag; ++ uint8_t profile_idc; ++ uint8_t profile_compatibility_flag[32]; ++ uint8_t level_idc; ++ uint8_t progressive_source_flag; ++ uint8_t interlaced_source_flag; ++ uint8_t non_packed_constraint_flag; ++ uint8_t frame_only_constraint_flag; ++} PTLCommon; ++ ++typedef struct PTL { ++ PTLCommon general_ptl; ++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; ++ ++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; ++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; ++} PTL; ++ ++typedef struct HEVCRpiVPS { ++ uint8_t vps_temporal_id_nesting_flag; ++ int vps_max_layers; ++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 ++ ++ PTL ptl; ++ int vps_sub_layer_ordering_info_present_flag; ++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; ++ int vps_max_layer_id; ++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 ++ uint8_t vps_timing_info_present_flag; ++ uint32_t vps_num_units_in_tick; ++ uint32_t vps_time_scale; ++ uint8_t vps_poc_proportional_to_timing_flag; ++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 ++ int vps_num_hrd_parameters; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiVPS; ++ ++typedef struct ScalingList { ++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, ++ * and size ID 3 only has 2 arrays, not 6. 
*/ ++ uint8_t sl[4][6][64]; ++ uint8_t sl_dc[2][6]; ++} ScalingList; ++ ++typedef struct HEVCRpiSPS { ++ unsigned vps_id; ++ uint8_t chroma_format_idc; ++ uint8_t separate_colour_plane_flag; ++ ++ HEVCRpiWindow output_window; ++ ++ HEVCRpiWindow pic_conf_win; ++ ++ uint16_t wp_offset_half_range; // WpOffsetHalfRange ++ ++ uint8_t bit_depth; ++ ++// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth ++ uint8_t pixel_shift; ++ enum AVPixelFormat pix_fmt; ++ ++ unsigned int log2_max_poc_lsb; ++ ++ int max_sub_layers; ++ struct { ++ int max_dec_pic_buffering; ++ int num_reorder_pics; ++ int max_latency_increase; ++ } temporal_layer[HEVC_MAX_SUB_LAYERS]; ++ uint8_t temporal_id_nesting_flag; ++ ++ uint8_t scaling_list_enable_flag; ++ ScalingList scaling_list; ++ ++ unsigned int nb_st_rps; ++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; ++ ++ uint8_t amp_enabled_flag; ++ uint8_t sao_enabled; ++ ++ uint8_t long_term_ref_pics_present_flag; ++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t num_long_term_ref_pics_sps; ++ ++ struct { ++ uint8_t bit_depth; ++ uint8_t bit_depth_chroma; ++ uint8_t log2_min_pcm_cb_size; ++ uint8_t log2_max_pcm_cb_size; ++ uint8_t loop_filter_disable_flag; ++ } pcm; ++ char sps_temporal_mvp_enabled_flag; ++// char sps_strong_intra_smoothing_enable_flag; -> intra_filtes_disable ++ ++ uint8_t log2_min_cb_size; // 3..6 ++ uint8_t log2_diff_max_min_coding_block_size; ++ uint8_t log2_min_tb_size; // 2..5 ++ uint8_t log2_max_trafo_size; ++ uint8_t log2_ctb_size; // 4..6 ++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) ++#define LOG2_MIN_PU_SIZE 2 ++#define LOG2_MIN_CU_SIZE 3 ++ ++ uint8_t max_transform_hierarchy_depth_inter; ++ uint8_t max_transform_hierarchy_depth_intra; ++ ++ char transform_skip_rotation_enabled_flag; ++ char transform_skip_context_enabled_flag; ++ char implicit_rdpcm_enabled_flag; ++ char 
// Per-CTB flag bits stored in HEVCRpiPPS.ctb_ts_flags (one uint8_t per CTB).
// setup_pps() fills the array while walking the picture tile by tile, one
// tile line at a time, so the array is in tile-scan order.
#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
#define CTB_TS_FLAGS_EOTL       (1U << 1)       // Last CTB of a tile line
#define CTB_TS_FLAGS_EOL        (1U << 2)       // Last CTB of a complete line
#define CTB_TS_FLAGS_EOT        (1U << 3)       // Last CTB of a tile
// Set on the 2nd CTB of every tile line when entropy_coding_sync is enabled
// and the tile is more than one CTB wide
// NOTE(review): presumably the point where CABAC state is saved for the next
// line - confirm against the CABAC code
#define CTB_TS_FLAGS_CSAVE      (1U << 4)
// Set on the first CTB of every tile, and on every tile line start when
// entropy_coding_sync is enabled and the tile is exactly one CTB wide
#define CTB_TS_FLAGS_CIREQ      (1U << 5)       // Cabac init request
#define CTB_TS_FLAGS_TOT        (1U << 6)       // CTB on top row of a tile
// Set on a tile line start when entropy_coding_sync is enabled and CIREQ is
// not set on that CTB
// NOTE(review): presumably the point where the saved CABAC state is loaded -
// confirm against the CABAC code
#define CTB_TS_FLAGS_CLOAD      (1U << 7)
/**
 * Parameter set storage shared by the NAL parsers.
 *
 * The *_list arrays hold one AVBufferRef per parameter set id; an entry is
 * NULL when that id has not been received.  The buffer's data pointer is the
 * parsed structure (e.g. sps_list[id]->data is read as an HEVCRpiSPS by the
 * PPS parser).
 */
typedef struct HEVCRpiParamSets {
    /* currently active parameter sets */
    const HEVCRpiVPS *vps;
    const HEVCRpiSPS *sps;
    const HEVCRpiPPS *pps;

    // Received parameter sets, indexed by their id
    AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
    AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
    AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
} HEVCRpiParamSets;

// NOTE(review): the VPS/SPS parsers are not visible from here; presumably
// they follow the same contract as the PPS parser below - confirm.
int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
                           HEVCRpiParamSets *ps);
int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
                           HEVCRpiParamSets *ps, int apply_defdispwin);
/**
 * Parse a PPS NAL unit.
 *
 * On success the new PPS replaces ps->pps_list[pps_id] and 0 is returned.
 * On failure a negative AVERROR code is returned and the new PPS is released
 * without being stored.  The referenced SPS must already be present in
 * ps->sps_list.
 */
int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
                           HEVCRpiParamSets *ps);

int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
                                  ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);

int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
                           uint8_t *buf, int buf_size);

/**
 * Compute POC of the current frame and return it.
 *
 * @param sps            active SPS (supplies log2_max_poc_lsb)
 * @param pocTid0        POC used as the prediction reference
 * @param poc_lsb        POC LSBs of the current picture
 * @param nal_unit_type  NAL unit type; for BLA types the MSB part is 0
 */
int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);

#endif /* AVCODEC_RPI_HEVC_PS_H */
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "internal.h" ++#include "thread.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) ++{ ++ /* frame->frame can be NULL if context init failed */ ++ if (!frame->frame || !frame->frame->buf[0]) ++ return; ++ ++ frame->flags &= ~flags; ++ if (!frame->flags) { ++ ff_thread_release_buffer(s->avctx, &frame->tf); ++ ++ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL ++ frame->col_mvf = NULL; ++ ++ frame->collocated_ref = NULL; ++ } ++} ++ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ++ HEVC_FRAME_FLAG_SHORT_REF | ++ HEVC_FRAME_FLAG_LONG_REF); ++} ++ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++} ++ ++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) ++{ ++ int i, ret; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame * const frame = &s->DPB[i]; ++ if (frame->frame->buf[0]) ++ continue; ++ ++ ret = ff_thread_get_buffer(s->avctx, &frame->tf, ++ AV_GET_BUFFER_FLAG_REF); ++ if (ret < 0) ++ return NULL; ++ ++ frame->col_mvf = NULL; ++ frame->col_mvf_buf = NULL; ++ if (s->used_for_ref && !s->is_irap) ++ { ++ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); ++ if (!frame->col_mvf_buf) ++ goto fail; ++ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; ++ } ++ ++ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; ++ 
frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); ++ ++ return frame; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, frame, ~0); ++ return NULL; ++ } ++ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); ++ return NULL; ++} ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) ++{ ++ HEVCRpiFrame *ref; ++ int i; ++ ++ /* check that this POC doesn't already exist */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && ++ frame->poc == poc) { ++ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", ++ poc); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ ref = alloc_frame(s); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ ++ *frame = ref->frame; ++ s->ref = ref; ++ ++ if (s->sh.pic_output_flag) ++ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; ++ else ++ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; ++ ++ ref->poc = poc; ++ ref->sequence = s->seq_decode; ++ ref->frame->crop_left = s->ps.sps->output_window.left_offset; ++ ref->frame->crop_right = s->ps.sps->output_window.right_offset; ++ ref->frame->crop_top = s->ps.sps->output_window.top_offset; ++ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) ++{ ++ do { ++ int nb_output = 0; ++ int min_poc = INT_MAX; ++ int i, min_idx, ret; ++ ++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && ++ frame->sequence == s->seq_output) { ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ } ++ } ++ } ++ ++ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && ++ frame->sequence == s->seq_output) { ++ nb_output++; ++ if (frame->poc < min_poc || nb_output == 1) { ++ min_poc = frame->poc; ++ min_idx = i; ++ } ++ } ++ } ++ ++ /* wait for more frames before output */ ++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && ++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) ++ return 0; ++ ++ if (nb_output) { ++ HEVCRpiFrame *frame = &s->DPB[min_idx]; ++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) ++ return 0; ++ ++ ret = av_frame_ref(out, frame->frame); ++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); ++ else ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ if (ret < 0) ++ return ret; ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "Output frame with POC %d.\n", frame->poc); ++ return 1; ++ } ++ ++ if (s->seq_output != s->seq_decode) ++ s->seq_output = (s->seq_output + 1) & 0xff; ++ else ++ break; ++ } while (1); ++ ++ return 0; ++} ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) ++{ ++ int dpb = 0; ++ int min_poc = INT_MAX; ++ int i; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ dpb++; ++ } ++ } ++ ++ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { ++ min_poc = frame->poc; ++ } ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if 
(frame->flags & HEVC_FRAME_FLAG_OUTPUT && ++ frame->sequence == s->seq_output && ++ frame->poc <= min_poc) { ++ frame->flags |= HEVC_FRAME_FLAG_BUMPING; ++ } ++ } ++ ++ dpb--; ++ } ++} ++ ++static int init_slice_rpl(HEVCRpiContext *s) ++{ ++ if (s->slice_idx >= s->rpl_tab_size) ++ return AVERROR_INVALIDDATA; ++ ++ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; ++ return 0; ++} ++ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) ++{ ++ RpiSliceHeader *sh = &s->sh; ++ ++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; ++ uint8_t list_idx; ++ int i, j, ret; ++ ++ ret = init_slice_rpl(s); ++ if (ret < 0) ++ return ret; ++ ++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + ++ s->rps[LT_CURR].nb_refs)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ for (list_idx = 0; list_idx < nb_list; list_idx++) { ++ RefPicList rpl_tmp = { { 0 } }; ++ RefPicList *rpl = &s->refPicList[list_idx]; ++ ++ /* The order of the elements is ++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and ++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ ++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, ++ list_idx ? 
ST_CURR_BEF : ST_CURR_AFT, ++ LT_CURR }; ++ ++ /* concatenate the candidate lists for the current frame */ ++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { ++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { ++ RefPicList *rps = &s->rps[cand_lists[i]]; ++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { ++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; ++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; ++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; ++ rpl_tmp.nb_refs++; ++ } ++ } ++ } ++ ++ /* reorder the references if necessary */ ++ if (sh->rpl_modification_flag[list_idx]) { ++ for (i = 0; i < sh->nb_refs[list_idx]; i++) { ++ int idx = sh->list_entry_lx[list_idx][i]; ++ ++ if (idx >= rpl_tmp.nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rpl->list[i] = rpl_tmp.list[idx]; ++ rpl->ref[i] = rpl_tmp.ref[idx]; ++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; ++ rpl->nb_refs++; ++ } ++ } else { ++ memcpy(rpl, &rpl_tmp, sizeof(*rpl)); ++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); ++ } ++ ++ if (sh->collocated_list == list_idx && ++ sh->collocated_ref_idx < rpl->nb_refs) ++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; ++ } ++ ++ return 0; ++} ++ ++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) ++{ ++ int i; ++ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { ++ if ((ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { ++ if (ref->poc == poc || (ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Could not find ref with POC %d\n", 
poc); ++ return NULL; ++} ++ ++static void mark_ref(HEVCRpiFrame *frame, int flag) ++{ ++ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); ++ frame->flags |= flag; ++} ++ ++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) ++{ ++ HEVCRpiFrame *frame; ++ int i, x, y; ++ ++ frame = alloc_frame(s); ++ if (!frame) ++ return NULL; ++ ++ if (!s->ps.sps->pixel_shift) { ++ for (i = 0; frame->frame->buf[i]; i++) ++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), ++ frame->frame->buf[i]->size); ++ } else { ++ for (i = 0; frame->frame->data[i]; i++) ++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) ++ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { ++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, ++ 1 << (s->ps.sps->bit_depth - 1)); ++ } ++ } ++ ++ frame->poc = poc; ++ frame->sequence = s->seq_decode; ++ frame->flags = 0; ++ ++ ff_hevc_rpi_progress_set_all_done(frame); ++ ++ return frame; ++} ++ ++/* add a reference with the given poc to the list and mark it as used in DPB */ ++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, ++ int poc, int ref_flag) ++{ ++ HEVCRpiFrame *ref = find_ref_idx(s, poc); ++ ++ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) ++ return AVERROR_INVALIDDATA; ++ ++ if (!ref) { ++ ref = generate_missing_ref(s, poc); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ } ++ ++ list->list[list->nb_refs] = ref->poc; ++ list->ref[list->nb_refs] = ref; ++ list->nb_refs++; ++ ++ mark_ref(ref, ref_flag); ++ return 0; ++} ++ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) ++{ ++ const ShortTermRPS *short_rps = s->sh.short_term_rps; ++ const LongTermRPS *long_rps = &s->sh.long_term_rps; ++ RefPicList *rps = s->rps; ++ int i, ret = 0; ++ ++ if (!short_rps) { ++ rps[0].nb_refs = rps[1].nb_refs = 0; ++ return 0; ++ } ++ ++ /* clear the reference flags on all frames except the current one */ ++ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame == s->ref) ++ continue; ++ ++ mark_ref(frame, 0); ++ } ++ ++ for (i = 0; i < NB_RPS_TYPE; i++) ++ rps[i].nb_refs = 0; ++ ++ /* add the short refs */ ++ for (i = 0; i < short_rps->num_delta_pocs; i++) { ++ int poc = s->poc + short_rps->delta_poc[i]; ++ int list; ++ ++ if (!short_rps->used[i]) ++ list = ST_FOLL; ++ else if (i < short_rps->num_negative_pics) ++ list = ST_CURR_BEF; ++ else ++ list = ST_CURR_AFT; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++ /* add the long refs */ ++ for (i = 0; i < long_rps->nb_refs; i++) { ++ int poc = long_rps->poc[i]; ++ int list = long_rps->used[i] ? LT_CURR : LT_FOLL; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++fail: ++ /* release any frames that are now unused */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); ++ ++ return ret; ++} ++ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) ++{ ++ int ret = 0; ++ int i; ++ const ShortTermRPS *rps = s->sh.short_term_rps; ++ LongTermRPS *long_rps = &s->sh.long_term_rps; ++ ++ if (rps) { ++ for (i = 0; i < rps->num_negative_pics; i++) ++ ret += !!rps->used[i]; ++ for (; i < rps->num_delta_pocs; i++) ++ ret += !!rps->used[i]; ++ } ++ ++ if (long_rps) { ++ for (i = 0; i < long_rps->nb_refs; i++) ++ ret += !!long_rps->used[i]; ++ } ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c +new file mode 100644 +index 0000000000..cd8149d58e +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.c +@@ -0,0 +1,368 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "golomb.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) ++{ ++ int cIdx, i; ++ uint8_t hash_type; ++ //uint16_t picture_crc; ++ //uint32_t picture_checksum; ++ hash_type = get_bits(gb, 8); ++ ++ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 
1 : 3)*/; cIdx++) { ++ if (hash_type == 0) { ++ s->is_md5 = 1; ++ for (i = 0; i < 16; i++) ++ s->md5[cIdx][i] = get_bits(gb, 8); ++ } else if (hash_type == 1) { ++ // picture_crc = get_bits(gb, 16); ++ skip_bits(gb, 16); ++ } else if (hash_type == 2) { ++ // picture_checksum = get_bits_long(gb, 32); ++ skip_bits(gb, 32); ++ } ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) ++{ ++ int i; ++ // Mastering primaries ++ for (i = 0; i < 3; i++) { ++ s->display_primaries[i][0] = get_bits(gb, 16); ++ s->display_primaries[i][1] = get_bits(gb, 16); ++ } ++ // White point (x, y) ++ s->white_point[0] = get_bits(gb, 16); ++ s->white_point[1] = get_bits(gb, 16); ++ ++ // Max and min luminance of mastering display ++ s->max_luminance = get_bits_long(gb, 32); ++ s->min_luminance = get_bits_long(gb, 32); ++ ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) ++{ ++ // Max and average light levels ++ s->max_content_light_level = get_bits_long(gb, 16); ++ s->max_pic_average_light_level = get_bits_long(gb, 16); ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) ++{ ++ get_ue_golomb_long(gb); // frame_packing_arrangement_id ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->arrangement_type = get_bits(gb, 7); ++ s->quincunx_subsampling = get_bits1(gb); ++ s->content_interpretation_type = get_bits(gb, 6); ++ ++ // 
spatial_flipping_flag, frame0_flipped_flag, field_views_flag ++ skip_bits(gb, 3); ++ s->current_frame_is_frame0_flag = get_bits1(gb); ++ // frame0_self_contained_flag, frame1_self_contained_flag ++ skip_bits(gb, 2); ++ ++ if (!s->quincunx_subsampling && s->arrangement_type != 5) ++ skip_bits(gb, 16); // frame[01]_grid_position_[xy] ++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte ++ skip_bits1(gb); // frame_packing_arrangement_persistence_flag ++ } ++ skip_bits1(gb); // upsampled_aspect_ratio_flag ++ return 0; ++} ++ ++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) ++{ ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->hflip = get_bits1(gb); // hor_flip ++ s->vflip = get_bits1(gb); // ver_flip ++ ++ s->anticlockwise_rotation = get_bits(gb, 16); ++ skip_bits1(gb); // display_orientation_persistence_flag ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, ++ void *logctx, int size) ++{ ++ HEVCSEIPictureTiming *h = &s->picture_timing; ++ HEVCRpiSPS *sps; ++ ++ if (!ps->sps_list[s->active_seq_parameter_set_id]) ++ return(AVERROR(ENOMEM)); ++ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; ++ ++ if (sps->vui.frame_field_info_present_flag) { ++ int pic_struct = get_bits(gb, 4); ++ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; ++ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { ++ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; ++ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { ++ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; ++ } ++ get_bits(gb, 2); // source_scan_type ++ get_bits(gb, 1); // duplicate_flag ++ skip_bits1(gb); ++ size--; ++ } ++ skip_bits_long(gb, 8 * size); ++ ++ return 0; ++} ++ ++static int 
decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, ++ int size) ++{ ++ int flag; ++ int user_data_type_code; ++ int cc_count; ++ ++ if (size < 3) ++ return AVERROR(EINVAL); ++ ++ user_data_type_code = get_bits(gb, 8); ++ if (user_data_type_code == 0x3) { ++ skip_bits(gb, 1); // reserved ++ ++ flag = get_bits(gb, 1); // process_cc_data_flag ++ if (flag) { ++ skip_bits(gb, 1); ++ cc_count = get_bits(gb, 5); ++ skip_bits(gb, 8); // reserved ++ size -= 2; ++ ++ if (cc_count && size >= cc_count * 3) { ++ const uint64_t new_size = (s->a53_caption_size + cc_count ++ * UINT64_C(3)); ++ int i, ret; ++ ++ if (new_size > INT_MAX) ++ return AVERROR(EINVAL); ++ ++ /* Allow merging of the cc data from two fields. */ ++ ret = av_reallocp(&s->a53_caption, new_size); ++ if (ret < 0) ++ return ret; ++ ++ for (i = 0; i < cc_count; i++) { ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ } ++ skip_bits(gb, 8); // marker_bits ++ } ++ } ++ } else { ++ int i; ++ for (i = 0; i < size - 1; i++) ++ skip_bits(gb, 8); ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, ++ int size) ++{ ++ uint32_t country_code; ++ uint32_t user_identifier; ++ ++ if (size < 7) ++ return AVERROR(EINVAL); ++ size -= 7; ++ ++ country_code = get_bits(gb, 8); ++ if (country_code == 0xFF) { ++ skip_bits(gb, 8); ++ size--; ++ } ++ ++ skip_bits(gb, 8); ++ skip_bits(gb, 8); ++ ++ user_identifier = get_bits_long(gb, 32); ++ ++ switch (user_identifier) { ++ case MKBETAG('G', 'A', '9', '4'): ++ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); ++ default: ++ skip_bits_long(gb, size * 8); ++ break; ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) ++{ ++ int num_sps_ids_minus1; ++ int 
i; ++ unsigned active_seq_parameter_set_id; ++ ++ get_bits(gb, 4); // active_video_parameter_set_id ++ get_bits(gb, 1); // self_contained_cvs_flag ++ get_bits(gb, 1); // no_parameter_set_update_flag ++ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 ++ ++ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { ++ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ active_seq_parameter_set_id = get_ue_golomb_long(gb); ++ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %u invalid\n", active_seq_parameter_set_id); ++ return AVERROR_INVALIDDATA; ++ } ++ s->active_seq_parameter_set_id = active_seq_parameter_set_id; ++ ++ for (i = 1; i <= num_sps_ids_minus1; i++) ++ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] ++ ++ return 0; ++} ++ ++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) ++{ ++ s->present = 1; ++ s->preferred_transfer_characteristics = get_bits(gb, 8); ++ return 0; ++} ++ ++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size) ++{ ++ switch (type) { ++ case 256: // Mismatched value from HM 8.1 ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ case HEVC_SEI_TYPE_FRAME_PACKING: ++ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); ++ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: ++ return decode_nal_sei_display_orientation(&s->display_orientation, gb); ++ case HEVC_SEI_TYPE_PICTURE_TIMING: ++ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); ++ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: ++ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); ++ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: ++ return decode_nal_sei_content_light_info(&s->content_light, gb); ++ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: ++
return decode_nal_sei_active_parameter_sets(s, gb, logctx); ++ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: ++ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); ++ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: ++ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ int type, int size) ++{ ++ switch (type) { ++ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, ++ const HEVCRpiParamSets * const ps, const int nal_unit_type) ++{ ++ int payload_type = 0; ++ int payload_size = 0; ++ int byte = 0xFF; ++ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); ++ ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_type += byte; ++ } ++ byte = 0xFF; ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 8 + 8LL*payload_size) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_size += byte; ++ } ++ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { ++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); ++ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ ++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); ++ } ++} ++ ++static int more_rbsp_data(GetBitContext *gb) ++{ ++ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; ++} ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const HEVCRpiParamSets *ps, int 
type) ++{ ++ int ret; ++ ++ do { ++ ret = decode_nal_sei_message(gb, logctx, s, ps, type); ++ if (ret < 0) ++ return ret; ++ } while (more_rbsp_data(gb)); ++ return 1; ++} ++ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) ++{ ++ s->a53_caption.a53_caption_size = 0; ++ av_freep(&s->a53_caption.a53_caption); ++} +diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h +new file mode 100644 +index 0000000000..d4ac348df9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.h +@@ -0,0 +1,135 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_SEI_H ++#define AVCODEC_RPI_HEVC_SEI_H ++ ++#include <stdint.h> ++ ++#include "libavutil/md5.h" ++ ++#include "get_bits.h" ++ ++/** ++ * SEI message types ++ */ ++typedef enum { ++ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0, ++ HEVC_SEI_TYPE_PICTURE_TIMING = 1, ++ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2, ++ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3, ++ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4, ++ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5, ++ HEVC_SEI_TYPE_RECOVERY_POINT = 6, ++ HEVC_SEI_TYPE_SCENE_INFO = 9, ++ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15, ++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, ++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, ++ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19, ++ HEVC_SEI_TYPE_POST_FILTER_HINT = 22, ++ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23, ++ HEVC_SEI_TYPE_FRAME_PACKING = 45, ++ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47, ++ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128, ++ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129, ++ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130, ++ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131, ++ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, ++ HEVC_SEI_TYPE_SCALABLE_NESTING = 133, ++ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134, ++ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137, ++ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144, ++ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, ++} HEVC_SEI_Type; ++ ++typedef struct HEVCSEIPictureHash { ++ uint8_t md5[3][16]; /* 16 digest bytes for each of the 3 cIdx slots the parser fills */ ++ uint8_t is_md5; /* set to 1 by the parser when hash_type == 0 */ ++} HEVCSEIPictureHash; ++ ++typedef struct HEVCSEIFramePacking { ++ int present; ++ int arrangement_type; ++ int content_interpretation_type; ++ int quincunx_subsampling; ++ int current_frame_is_frame0_flag; ++} HEVCSEIFramePacking; ++ ++typedef struct HEVCSEIDisplayOrientation { ++ int present; ++ int
anticlockwise_rotation; ++ int hflip, vflip; ++} HEVCSEIDisplayOrientation; ++ ++typedef struct HEVCSEIPictureTiming { ++ int picture_struct; ++} HEVCSEIPictureTiming; ++ ++typedef struct HEVCSEIA53Caption { ++ int a53_caption_size; ++ uint8_t *a53_caption; ++} HEVCSEIA53Caption; ++ ++typedef struct HEVCSEIMasteringDisplay { ++ int present; ++ uint16_t display_primaries[3][2]; ++ uint16_t white_point[2]; ++ uint32_t max_luminance; ++ uint32_t min_luminance; ++} HEVCSEIMasteringDisplay; ++ ++typedef struct HEVCSEIContentLight { ++ int present; ++ uint16_t max_content_light_level; ++ uint16_t max_pic_average_light_level; ++} HEVCSEIContentLight; ++ ++typedef struct HEVCSEIAlternativeTransfer { ++ int present; ++ int preferred_transfer_characteristics; ++} HEVCSEIAlternativeTransfer; ++ ++typedef struct HEVCSEIContext { ++ HEVCSEIPictureHash picture_hash; ++ HEVCSEIFramePacking frame_packing; ++ HEVCSEIDisplayOrientation display_orientation; ++ HEVCSEIPictureTiming picture_timing; ++ HEVCSEIA53Caption a53_caption; ++ HEVCSEIMasteringDisplay mastering_display; ++ HEVCSEIContentLight content_light; ++ int active_seq_parameter_set_id; ++ HEVCSEIAlternativeTransfer alternative_transfer; ++} HEVCSEIContext; ++ ++struct HEVCRpiParamSets; ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const struct HEVCRpiParamSets *ps, int type); ++ ++/** ++ * Reset SEI values that are stored on the Context. ++ * e.g. Caption data that was extracted during NAL ++ * parsing. ++ * ++ * @param s HEVCRpiContext. 
++ */ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); ++ ++#endif /* AVCODEC_RPI_HEVC_SEI_H */ +diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c +new file mode 100644 +index 0000000000..23b49a99ae +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.c +@@ -0,0 +1,1537 @@ ++#include "rpi_hevc_shader.h" ++ ++#ifdef _MSC_VER ++ #include ++ /* cast through uintptr_t to avoid warnings */ ++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) ++#else ++ #define POINTER_TO_UINT(X) ((unsigned int)(X)) ++#endif ++ ++#ifdef __cplusplus ++extern "C" { /* the types are probably wrong... */ ++#endif ++#ifdef __cplusplus ++} ++#endif ++ ++#ifdef _MSC_VER ++__declspec(align(8)) ++#elif defined(__GNUC__) ++__attribute__((aligned(8))) ++#endif ++unsigned int ff_hevc_rpi_shader[] = { ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000068] */ 
0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000148] */ 
0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y ++// :1 ++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c_p ++/* [0x00000218] */ 0x9581cff6, 
0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, 
ra_kmul_add ++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 
; mov rb6, ra7 ++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c_p_l1 ++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000428] */ 0x8c8021f6, 
0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; 
mov rb4, ra5 ; ldtmu1 ++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000580] */ 
0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c_b ++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000638] */ 
0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x000006e8] */ 
0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d ++// :1 ++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000790] */ 
0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* 
[0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000008f8] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_sync_q0 ++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009e0] 
*/ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* 
[0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q9 ++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_qn ++// ::mc_exit_y_qn ++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000bb8] */ 
0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_q0 ++// ::mc_exit_y_q0 ++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000c88] */ 0x050b0a00, 
0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* 
[0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_8 ++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* 
[0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000fc0] */ 0x3a281100, 
0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++// ::mc_filter_y_pxx ++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* 
[0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001108] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add 
ra_dma0, ra_dma0, r1 ++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* 
[0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001308] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, 
r:1b ++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, 
r0, rb_dma0_base ++// :1 ++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x00001538] 
*/ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x000015e8] */ 0x915c83f6, 
0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_setup_c10_q0 ++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c10_qn ++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << 
v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, 
h16p(0, 0)) ++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y ++// :1 ++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001850] 
*/ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c10_p ++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, 
i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x000019a0] */ 0x550c6ffe, 
0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_p_l1 ++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* 
[0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; 
ldtmu1 ++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, 
// asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // 
min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001d18] */ 
0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d ++// :1 ++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 
4 @ "mul_used", 0 ++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* 
[0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add 
ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, 
vw_wait ++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov 
dst, sacq(i) ++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, 
sacq(i) ++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x000022b8] */ 0x0a0b0500, 
0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* 
[0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* 
[0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; 
v8min r0, r0, ra_blk_height ++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror 
ra3.8c, r1, ra8.8d ++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++// ::mc_filter_y10_pxx ++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* 
[0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* 
[0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, 
i_shift23 ++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x000028b0] */ 
0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base ++// :1 ++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* 
[0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002a18] */ 0x5501fff0, 
0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, 
r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b ++/* [0x00002b70] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, 
r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_end ++}; ++#ifdef __HIGHC__ ++#pragma Align_to(8, ff_hevc_rpi_shader) ++#endif +diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h +new file mode 100644 +index 0000000000..79651c9b6c +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.h +@@ -0,0 +1,63 @@ ++#ifndef rpi_hevc_shader_H ++#define rpi_hevc_shader_H ++ ++extern unsigned int ff_hevc_rpi_shader[]; ++ ++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) ++#define mc_start (ff_hevc_rpi_shader + 0) ++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) ++#define 
mc_filter_c_p (ff_hevc_rpi_shader + 134) ++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) ++#define mc_filter_c_b (ff_hevc_rpi_shader + 386) ++#define mc_sync_q0 (ff_hevc_rpi_shader + 580) ++#define mc_sync_q1 (ff_hevc_rpi_shader + 598) ++#define mc_sync_q2 (ff_hevc_rpi_shader + 610) ++#define mc_sync_q3 (ff_hevc_rpi_shader + 622) ++#define mc_sync_q4 (ff_hevc_rpi_shader + 634) ++#define mc_sync_q5 (ff_hevc_rpi_shader + 652) ++#define mc_sync_q6 (ff_hevc_rpi_shader + 664) ++#define mc_sync_q7 (ff_hevc_rpi_shader + 676) ++#define mc_sync_q8 (ff_hevc_rpi_shader + 688) ++#define mc_sync_q9 (ff_hevc_rpi_shader + 706) ++#define mc_sync_q10 (ff_hevc_rpi_shader + 718) ++#define mc_sync_q11 (ff_hevc_rpi_shader + 730) ++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) ++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) ++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) ++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) ++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) ++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) ++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) ++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) ++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) ++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) ++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) ++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) ++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) ++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) ++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) ++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) ++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) ++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) ++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) ++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) ++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) ++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) ++#define mc_sync10_q9 
(ff_hevc_rpi_shader + 2122) ++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) ++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) ++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) ++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) ++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) ++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566) ++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) ++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) ++#define mc_end (ff_hevc_rpi_shader + 2860) ++ ++#endif +diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm +new file mode 100644 +index 0000000000..af5b59e181 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.qasm +@@ -0,0 +1,1850 @@ ++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++# All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are met: ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# * Neither the name of the copyright holder nor the ++# names of its contributors may be used to endorse or promote products ++# derived from this software without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# ++# Written by Peter de Rivaz, John Cox ++ ++ ++ ++# Inter pred asm ++# ++# Logic here should be good to 14 bits without modification ++# but only 8 & 10 are currently instantiated & tested ++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow ++# in _p00 & _b00 ++ ++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress ++# the warning that we are using rotation & ra/rb registers. r0..3 can be ++# rotated through all 16 elems ra regs can only be rotated through their ++# local 4. As it happens this is what is wanted here as we do not want the ++# constants from the other half of the calc. ++ ++# Number limits in P/B calculation ++# ++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier ++# we offset our intermediates s.t. they always end up +ve before the next ++# multiply (may be -ve whilst summing but that doesn't matter). 
++# ++# Range calc for up to 14 bits (Y-B pred): ++# ++# denom: [0, 7] ++# bmax = (1 << bits) - 1 ++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] ++# ++# wt_mul: [-128, 255] ++# wt_off = off * 2 + 1: [-bmax, bmax] ++# ++# pel: [0, bmax] ++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] ++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] ++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] ++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] ++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): ++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] ++# ++# This all looks good and is mostly bit depth independent - and as we manage ++# to do unsigned multiplies everywhere (now) this should be good for any bit ++# depth up to 14 (we could probably do 16 - but that requires a few tweaks ++# to the shifts we don't currently have logic for) ++ ++# PREREAD is the number of requests that we have sitting in the TMU request ++# queue. ++# ++# There are 8 slots available in the TMU request Q for tm0s requests, but ++# only 4 output FIFO entries and overflow is bad (corruption or crash) ++# (If threaded then only 2 out FIFO entries, but we aren't.) ++# In s/w we are effectively limited to the min vertical read which is >= 4 ++# so output FIFO is the limit. 
++# ++# As the test for read-next is in the main part of the Luma loop (rather than ++# the preload FIFO part) we are limited to min_luma_height - 1 ++# Min_luma_height is 4 so we can only have a preload of 3 ++# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick ++# in chroma without abandoning preload pretty much entirely (which would be bad) ++# ++# Timing tests vs preload of 4 suggests this doesn't hurt us much ++# Could have preread 4 for Chroma but when tested it didn't help ++ ++.set PREREAD, 3 ++ ++# Offset added (effectively) at the exit of the H FIR filter ++# This is enough to force the result +ve ++# Is good if it is a power of 2 as that allows for >> without loss ++# ++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 ++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 ++# Round up to next power of 2 ++ ++.set FIR_OFFSET, 0x4000 ++ ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 ++ ++# Value to add to the weight multiplier to convert it into an unsigned value ++# Should be power of two for convenience ++ ++.set LOG2_MUL_ADD, 14 ++.set MUL_ADD, (1 << LOG2_MUL_ADD) ++ ++# Fixed denom (max that it can be set to) ++.set DENOM, 7 ++ ++# register allocation ++# ++ ++# ra0-3 ++# Used as temp and may be loop filter coeffs (split into .8s) ++# or temp in loop. Check usage on an individual basis. 
++ ++# ra4-11 ++# V FIFO / temp / free ++ ++# -- free -- ra12 ++ ++# -- free -- ra13 ++ ++# -- free -- ra14 ++ ++# -- free -- ra15 ++ ++# uniform: width:height ++.set ra_width_height, ra16 ++.set ra_width, ra16.16b ++.set ra_height, ra16.16a ++ ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b ++ ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff800100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k128, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# Max pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# --free -- ra23.8d ++ ++# Loop: src frame base (L0) ++.set ra_base, ra24 ++ ++# Misc offsets ++.set ra_fir_off_val_wt_den_p7, ra25 ++.set ra_wt_den_p7, ra25.8a ++# -- free -- ra25.8b ++.set ra_fir_off_val, ra25.16b ++ ++# As it happens these constants are the same ++.if FIR_OFFSET == MUL_ADD ++# Weight multiplier unsigned add ++.set ra_kmul_add, ra_fir_off_val ++.else ++.error "FIR_OFFSET != MUL_ADD: Need new register & init" ++.endif ++ ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set ra_dma0, ra27 ++ ++# Loop: destination address ++.set ra_dest, ra28 ++ ++# Setup: Dup of rb_ef ++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul ++# (top bits are ignored by mul24) ++.set ra_ef, ra29 ++ ++# Use an even numbered register as a link register to avoid corrupting flags ++.set ra_link, ra30 ++ ++# -- free -- ra31 ++ ++.set rb_xshift2, rb0 ++.set rb_xshift2_next, rb1 ++ ++# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 ++ ++# El Flags ++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n ++# Duped into ra_ef as sometimes that is easier to use ++.set rb_ef, rb3 ++ ++# rb4-11 ++# Loop: V filter FIFO or V filter coeff ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# -- free -- rb13 ++ ++# -- free -- rb14 ++ ++# Loop: src frame base (L1) ++.set rb_base2, rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice ++.set rb_xpitch, rb20 ++ ++# These 3 consts each save 1 instruction in Y loop setup ++# so whilst they are worthwhile they should be the 1st to die if we need ++# another b reg ++.set rb_y_coeffs_2, rb21 # 0x050b0a00 ++.set rb_y_coeffs_3, rb22 # 0x11283a40 ++.set rb_y_coeffs_5, rb23 # 0x0a0b0500 ++ ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb24 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb25 ++ ++# Setup: pic width - 1 ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. ++.set rb_max_x, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# Setup: FIR H offset ++.set rb_fir_off_h, rb31 ++ ++ ++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
++.set i_shift16, -16 ++.set i_shift21, -11 ++.set i_shift23, -9 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma ++ mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 ++ asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else ++ shl r1, r1, 5 ++.endif ++ and r0, r2, 1 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 ++ ++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into ++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ shl r0, r0, 6 ++.endif ++ add r_dma, r0, r1 # DMA out ++.endm ++ ++ ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ ++################################################################################ ++# mc_setup_c ++# ++# typedef struct qpu_mc_pred_c_s_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint32_t pic_cw; // C Width (== Y width / 2) ++# uint32_t pic_ch; // C Height (== Y Height / 2) ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_s_t; ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set 
v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base ++ ++# Read image dimensions ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height ++ ++# load constants ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ ++# get source pitch ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif ++ ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add 
r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 ++ add ra_base, ra_base, r0 ++ ++# Compute part of VPM to use for DMA output ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++# And again for L1, but only worrying about frame2 stuff ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# rb_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 ++ min r0, r0, rb_max_x ++ ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ; mov r3, PREREAD ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add rb_base2, rb_base2, r0 ; mov r0, ra_y ++ ++# Do preloads ++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ mov ra_link, unif # link ++# touch registers to keep simulator happy (and fills in delay slots) ++ mov ra4, 0 ; mov rb4, 0 ++ bra -, ra_link ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 ++# >>> ra_link ++.endm ++ ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 ++ ++################################################################################ ++# ++# mc_filter_c_p ++# ++# typedef struct qpu_mc_pred_c_p_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# 
uint32_t coeffs_x; ++# uint32_t coeffs_y; ++# uint32_t wo_u; ++# uint32_t wo_v; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_p_t; ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convienient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convienient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, rb_base2 ++.set vr_txs, t1s ++.endif ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++ ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height ++ ++# set up VPM write ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight ++ ++# Misc final setup... ++ ++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr ++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight ++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) ++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link ++ ++# r5 = -4 (loop counter) ++# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) ++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) ++# rb31 = FIR value offset ++ ++# FIFO: rb4, ra5, rb6, ra7 ++# Coeffs in ra3.8a, ra3.8b, rb10, rb11 ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
++ ++:1 ++# retrieve texture results and pick out bytes ++# then submit two more texture requests ++ ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] ++.endif ++ ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++ ++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++.if v_tmu == 0 ++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes ++.else ++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes ++.endif ++ ++# apply horizontal filter ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are valid for all QPUs ++ ++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ ++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) ++# We would like to save the r5->r4 shift but we need a delay slot ++# for both r7 & r6 which we can't find anything to put in if we have ++# already multiplied r4 & r5! 
++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post ++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post ++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++# >>> .anyn 1b ++ ++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] ++ sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ++ ++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 ++ ++################################################################################ ++# ++# mc_filter_c_b ++# ++# typedef struct qpu_mc_pred_c_b_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x1; ++# uint32_t coeffs_y1; ++# int16_t weight_u1; ++# int16_t weight_v1; ++# int16_t y2; ++# int16_t x2; 
++# uint32_t base2; ++# uint32_t coeffs_x2; ++# uint32_t coeffs_y2; ++# uint32_t wo_u2; ++# uint32_t wo_v2; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_b_t; ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++ ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs ++ ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif ++ ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ ++# set up VPM write ++ ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 
# ; V weight ++ ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x ++ ++# L1 - uniform layout could possibly be optimized ++ ++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b ++ ++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 ++ sub.setf -, r5, rb_lcount ; mov r0, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, ra7, rb7 ++ ++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 ++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 ++ sub r2, r2, r0 ++ ++ shr r1, r1, 6 ++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write 
pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_b ++ m_filter_c_b 8 ++ ++################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ # >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. 
++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 ++ ++# mc_exit() ++# Chroma & Luma the same now ++ ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn ++ ++ ++ ++# mc_interrupt_exit12() ++ ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 ++ ++# LUMA CODE ++ ++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
++# For P frames we make the second x,y coordinates offset by +8 ++ ++ ++################################################################################ ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ ++ # Need to save these because we need to know the frame dimensions before computing texture coordinates ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base ++ ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ mov rb_y_coeffs_2, 0x050b0a00 ++ mov rb_y_coeffs_3, 0x11283a40 ++ mov rb_y_coeffs_5, 0x0a0b0500 ++ ++# Compute part of VPM to use ++ ++# Read image dimensions ++ mov ra3, unif # width_height ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 ++.if v_x_shift == 0 ++ sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif ++ sub rb_max_y, ra3.16a, 1 ++ mov r3, elem_num ; mov rb_pitch, unif # stride1 ++ ++# get destination pitch ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] ++ or rb_dma1_base, r1, rb_pitch ++ ++# Compute base address for first and second access ++ add r0, ra0.16b, r3 # Load x + 
elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ ++# X is byte offset - we can only load words - mask ++ ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ # r3 still contains elem_num ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add rb_base2, ra11, r0 ++ ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn ++ ++# touch vertical context to keep simulator happy ++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] ++ bra -, ra_link ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 ++ ++################################################################################ ++# ++# Start of per-block setup code ++# P and B blocks share the same setup code to save on Icache space ++ ++# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation ++ ++# 1st 3 instructions of per_block-setup in 
branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# ++ ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... ++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm ++ ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++ ++ add r0, ra1.16b, r3 # Load x2 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24 
r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ add rb_base2_next, rb_base2_next, r0 ++ ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, (7-8) ++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val ++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets ++ ++# get filter coefficients and discard unused B frame values ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov rb5, ra_k255 ++ ++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) ++ ++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val ++# but I can't see a way of doing that that is cheap enough to be worth it ++ ++# Picked out in a slightly random order to space out uniform loads ++ ++ # 1 ++ mov r1, 0x01040400 # [ra8 delay] ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c ++ # 2 ++ ror ra2.8c, rb_y_coeffs_2, ra8.8d ++ ror ra0.8c, rb_y_coeffs_2, ra8.8c ++ # 0 ++ mov r1,0x00010100 # -ve [ra8 delay] ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset ++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++ # 7 ++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 ++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address ++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++ # 3 ++ ror ra2.8d, rb_y_coeffs_3, ra8.8d ++ ror ra0.8d, rb_y_coeffs_3, ra8.8c ++ # 5 ++ ror ra3.8b, rb_y_coeffs_5, ra8.8d ++ ror ra1.8b, rb_y_coeffs_5, ra8.8c ++ # 6 ++ mov r1,0x04040100 ++ ror ra3.8c, 
r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val ++ ++ bra -, ra_link ++ # 4 ++ mov r1,0x3a281100 ++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val ++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++# >>> branch ra_link ++ ++# r5 = -8 ++# r2 = fir_off_val ++# r3 = 128 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ ++ ++ ++################################################################################ ++# ++# mc_filter_y_pxx ++# ++# Setup (& therefore uniform struct) shared with _bxx ++# Struct in m_luma_setup ++# ++# We can have 2 separate P reqs here as long as they mate to generate a ++# rectangular output block (i.e. h0 = h1, w0 = 8) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_pxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p5 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# retrieve texture results and pick out bytes ++# then submit two more texture requests ++ ++# This loop is identical to the B loop from here ---> ++:1 ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 
0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++ ++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++ ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov 
-, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 ++ ++ ++################################################################################ ++ ++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# ++# Setup (& therefore uniform struct) shared with _pxx ++# Struct in m_luma_setup ++# ++# l0 calc in els 0-7, L1 in 8-15 ++# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_bxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p6 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# This loop is identical to the P loop from here ---> ++:1 ++ add.setf -, 
ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add 
r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub r1, r1, ra4 ++ sub r1, r1, r0 ; mov r2, rb_wt_off ++ ++ asr r1, r1, 6 ++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++ add r1, r1, r2 ; mov r0, r1 << 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed block_height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link (ra_height - remaining height) ++ ++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 ++ ++################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } 
qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r0, elem_num # y_x ++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height ++ min r0, r0, rb_max_x ; mov ra_width_height, unif ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 # Combine width and height of destination area ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link ++ add ra_dma0, r0, rb_dma0_base ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask 
++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, DENOM + 8 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) 
++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r2, 1 ++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++ ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of 
output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct-mapped) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ ++ ++::mc_end ++# Do not add code here because mc_end must appear after all other code. 
+diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h +new file mode 100644 +index 0000000000..89711d776b +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_cmd.h +@@ -0,0 +1,165 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ int16_t weight_u1; ++ int16_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { ++ union { ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t next_fn; ++} 
qpu_mc_pred_y_s_t; ++ ++typedef struct qpu_mc_pred_sync_s { ++ uint32_t next_fn; ++} qpu_mc_pred_sync_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; ++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; ++} qpu_mc_pred_y_t; ++ ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ qpu_mc_pred_sync_t sync; ++} qpu_mc_pred_cmd_t; ++ ++static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) ++{ ++ // Link is the last element (next_fn) of the previous cmd, i.e. the 32-bit word immediately before cmd ++ ((uint32_t *)cmd)[-1] = fn; ++} ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ ++#define QPU_MC_DENOM 7 ++ ++#pragma pack(pop) ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c +new file mode 100644 +index 0000000000..77d8366eb8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.c +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (int32_t)((uint32_t)x << shl) >> shr; // shift left as unsigned: << on a negative or overflowing int32_t is undefined behaviour ++} ++ ++static inline int woff_p(HEVCRpiContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCRpiContext 
*const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_hevc_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_hevc_shader_template_fn.h" ++ +diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h +new file mode 100644 +index 0000000000..0fc5a45e9f +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.h +@@ -0,0 +1,49 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++struct HEVCRpiContext; ++struct HEVCRpiInterPredEnv; ++ ++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h +new file mode 100644 +index 0000000000..10c163a4b9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template_fn.h +@@ -0,0 +1,502 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, 
dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y: clip vertically, recording the rows to duplicate above (dt) and below (db) ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup: fill the clipped margins by replicating the nearest fetched column / row ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// x, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ 
const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ 
else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t 
patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ 
uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, 
my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s +new file mode 100644 +index 0000000000..3caef20137 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,444 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) ++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) ++.set USE_STACK, 0 ++ ++# Lines that fail to assemble start with #: ++# The script insert_magic_opcodes.sh inserts the machine code directly for these. ++# HEVC VPU Transform ++# ++# Transform matrix can be thought of as ++# output row vector = input row vector * transMatrix2 ++# ++# The even rows of the matrix are symmetric ++# The odd rows of the matrix are antisymmetric ++# ++# So only need to compute the first half of the results, then can compute the remainder with a butterfly ++# ++# EXAMPLE ++# (a b c d) (1 2 2 1) ++# (3 4 -4 -3) ++# (5 6 6 5) ++# (7 8 -8 -7) ++# ++# x=(a c)(1 2) = 1a+5c 2a+6c ++# (5 6) ++# ++# y=(b d)(3 4) = 3b+7d 4b+8d ++# (7 8) ++# ++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d ++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d ++# ++# Final results are (u , v[::-1]) ++# ++# ++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) ++# Apply the even matrix first and stop before rounding ++# Then apply the odd matrix in a full manner: ++# ++# First step is to compute partial products with the first input (16 cycles) ++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output ++# 2a 4b 6c 8d ++# 2a -4b 6c -8d ++# 1a -3b 5c -7d ++# ++# Second step is to sum partial products into final position (8 cycles) ++# 1a+3b+5c+7d ++# 2a+4b+6c+8d ++# 2a-4b+6c-8d ++# 1a-3b+5c-7d ++# ++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) ++# ++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use 
of the trick - saves on the adds) ++# ++# For 8x8 we could compute two in parallel. ++# ++# ++ ++# Columns are transformed first ++# ++# Store top left half of transMatrix2 in HX(32,0) ++# Store bottom left half of transMatrix2 in HX(32,32) ++# ++# For 16x16 ++# HX(0:15,0) contains input data before transform ++# HY(0:15,0) contains 32bit output data after transform ++# HX(32,0) contains even rows of left half of transMatrix2 ++# HX(32,32) contains odd rows of left half of transMatrix2 ++# HY(48,0) contains partial products ready for summing ++# ++ ++ ++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# coeffs32 ++# num32: number of 32x32 transforms ++# command 0 for transform, 1 for memclear16(int16_t *dst,num16) ++# ++ ++.equ TRANS_SHIFT, 20 - BIT_DEPTH ++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) ++.equ TRANS_ASL2, 16 - TRANS_SHIFT ++ ++ ++hevc_trans_16x16: ++ push r6-r15, lr # TODO cut down number of used registers ++ mov r14,r3 # coeffs32 ++ mov r15,r4 # num32 ++ mov r3, 16*2 # Stride of transMatrix2 in bytes ++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix ++ ++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix ++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ # Now use r0 to describe which matrix we are working on. ++ # Allows us to prefetch the next block of coefficients for efficiency. 
++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack ++ ++ add r11,sp,64 # Space for 32 bytes before, and rounding ++ lsr r11,5 ++ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 ++ ++ lsr r10, r2, 16 # Number of compressed blocks stored in top short ++ extu r2,16 ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++ # r0 VRF location of current block ++ # r1 address of current block ++ # r2 number of 16*16 transforms to do ++ # r3 Stride of coefficients (==32) ++ # r4 TRANS_RND1 (64) ++ # r5 TRANS_RND2 ++ # r6 temporary used inside col_trans16 ++ # r7 16*16*2 total bytes in block ++ # r8 64*16 VRF switch locations ++ # r9 temporary in unpack_coeff for index ++ # r10 number of 16x16 transforms using compression ++ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) ++ # r12 temporary counter in unpack_coeff ++ # r13 ++ # r14 Save information for 32 bit transform (coeffs location) ++ # r15 Save information for 32 bit transform (number of transforms) ++ cmp r2,0 ++ beq done16x16s ++block_loop: ++ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests ++ cmp r10,0 ++ mov r6, r1 ++ beq not_compressed ++ sub r10, 1 ++ bl unpack16x16 ++not_compressed: ++ #mov r6,r1 # DEBUG without compress ++ vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #eor r0,r8 ++ #add r1,r7 ++ # Prefetch the next block ++ #bl unpack16x16 ++ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG ++ #eor r0,r8 ++ #sub r1,r7 ++ ++ # Transform the current 
block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding (r5 = TRANS_RND2), shift down by TRANS_SHIFT, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++done16x16s: ++ ++ add sp,sp,64+16*16*2 # Restore stack pointer (undoes the matching sub sp made before block_loop) ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc # NOTE(review): follows an unconditional branch, so appears unreachable; hevc_trans_32x32 ends with its own pop r6-r15, pc - confirm ++# This returns a value in r6 that says where to load the data from. ++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. ++unpack16x16: ++# Clear out destination ++ vmov HX(0,0)+r0,0 ++ mov r6, r11 ++ vsth HX(0,0)+r0,(r6 += r3) REP 16 ++ mov r5, r1 # Moving pointer to input coefficients ++unpack_outer_loop: ++ # Loop until we find the end ++ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? 
++ sub r6,r11,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0)+r0,(r6) # Store into packed data ++ mov r12,0 ++unpack_loop: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r9,r4,16 # r9 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,8 ++ beq done_unpack ++ sth r9,(r11, r4) ++ addcmpblt r12,1,8,unpack_loop ++# # Read next 16 ++ add r5,32 ++ b unpack_outer_loop ++done_unpack: ++# # Set new load location ++ mov r6, r11 ++ #add r6,pc,unpacked_data-$ ++# # Restore constants ++ mov r4,64 ++ mov r5,TRANS_RND2 ++# pop r6-r15, pc ++ b lr ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# r1/r10 input pointer ++# r0,r4,r5,r6 free ++# r8/r9 output storage ++# ++# Store packed coefficients at r9-32 ++# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) ++unpack32x32: ++# Clear out destination ++ vmov HX(0,0),0 ++ add r0, r9, 32*32*2 # Unpacked buffer ++ mov r4, 32 ++ vsth HX(0,0),(r0 += r4) REP 64 
++unpack_outer_loop32: ++ # Loop until we find the end ++ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous? ++ sub r6,r9,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0),(r6) # Store into packed data ++ mov r8,0 ++unpack_loop32: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r5,r4,16 # r5 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,10 ++ beq done_unpack # NOTE(review): targets done_unpack (the 16x16 routine's exit, which also rewrites r4/r5/r6), not done_unpack32 below - looks like a copy/paste slip; confirm ++ sth r5,(r0, r4) ++ addcmpblt r8,1,8,unpack_loop32 ++# # Read next 16 ++ add r1,32 ++ b unpack_outer_loop32 ++done_unpack32: ++ b lr ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 32x32 transforms to be done in low 16, number of packed in high 16 ++# ++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! 
++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ lsr r15,r15,16 # Number that are packed ++ extu r2,16 # Total number ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ ++.if USE_STACK ++ # Stack base allocation ++ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking ++ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it ++ add r8,sp,63 ++ lsr r8,5 ++ lsl r8,5 ++.else ++#:version r8 ++ .half 0x00e8 #AUTOINSERTED ++ btst r8,16 ++#:add r8,pc,intermediate_results-$ ++ .half 0xbfe8 ++ .half intermediate_results-($-2) ++ beq on_vpu1 ++ add r8,r8,32*32*2*2+16*2 # Move to secondary storage ++on_vpu1: ++.endif ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++ ++ cmp r2,0 ++ beq done32x32s ++block_loop32: ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) ++ cmp r2,r15 ++ bgt not_compressed_32 ++ bl unpack32x32 ++ add r1,r9,32*32*2 # Uncompressed into temporary storage ++ mov r8,r9 # Transform into here ++not_compressed_32: ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for 
rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++done32x32s: ++ ++.if USE_STACK ++ add sp,sp,32*32*4+64# Restore stack ++.endif ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth 
VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++.if USE_STACK == 0 ++ .balign 32 ++ ++# .space directives generate 0's in the bin so avoid unnecessary padding by ++# just setting to appropriate value ++.equ intermediate_results, $+16*2 ++ ++# Layout goes: ++# ++#packed_buffer: ++# .space 16*2 ++#intermediate_results: ++# .space 32*32*2 ++#unpacked_buffer: ++# .space 32*32*2 ++# ++#packed_buffer2: ++# .space 16*2 ++#intermediate_results2: ++# .space 32*32*2 ++#unpacked_buffer2: ++# .space 32*32*2 ++.endif ++ ++ +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..1c364492d0 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform10 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 
0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 
02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..1128a2c054 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform8 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 
0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 
0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c +new file mode 100644 +index 0000000000..e651e5c565 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.c +@@ -0,0 +1,6134 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres 
++ * Copyright (C) 2012 - 2013 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Wassim Hamidouche ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++#include "libavutil/display.h" ++#include "libavutil/internal.h" ++#include "libavutil/mastering_display_metadata.h" ++#include "libavutil/md5.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/stereo3d.h" ++ ++#include "decode.h" ++#include "bswapdsp.h" ++#include "bytestream.h" ++#include "golomb.h" ++#include "hevc.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_parse.h" ++#include "rpi_hevcdec.h" ++#include "rpi_hevc_cabac_fns.h" ++#include "profiles.h" ++#include "hwconfig.h" ++ ++#include "rpi_zc_frames.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "pthread.h" ++#include ++ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ ++#define PACK2(hi,lo) (((hi) << 16) | ((lo) 
& 0xffff)) ++ ++#ifndef av_mod_uintp2 ++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) ++{ ++ return a & ((1 << p) - 1); ++} ++# define av_mod_uintp2 av_mod_uintp2_c ++#endif ++ ++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV & Y both have min 4x4 pred (no 2x2 chroma) ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) ++#define QPU_C_CMD_PER_CTU_MAX (8 * 8) ++ ++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) ++ ++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) ++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) ++ ++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) ++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) ++ ++// Total cmds to allocate - allow for slack & setup ++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) ++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) ++ ++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) ++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) ++ ++// The QPU code for UV blocks only works up to a block width of 8 ++#define RPI_CHROMA_BLOCK_WIDTH 8 ++ ++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++ ++ ++// Actual filter goes -ve, +ve, +ve, -ve using these values ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 
58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) ++}; ++ ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, 
/**
 * Fill a square block of bytes with one replicated value.
 *
 * @param b      top-left byte of the block
 * @param stride distance in bytes between the starts of consecutive rows
 * @param ln     log2 of the block side: 1..4 fill 2x2, 4x4, 8x8 or 16x16
 *               bytes; any other value (normally 0) writes a single byte
 * @param a      value to replicate; callers are expected to pass a value
 *               that fits in one byte
 *
 * The rows are written with memcpy() rather than through casts of b to
 * wider integer pointers, so the stores neither break the aliasing rules
 * nor depend on b being suitably aligned.  The bytes written are the same
 * as a native-endian store of the replicated word.
 */
static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
{
    unsigned int i;

    switch (ln)
    {
        default:  // normally 0
            *b = a;
            break;
        case 1:
        {
            uint16_t v;
            a |= a << 8;
            v = a;
            for (i = 0; i != 2; ++i)
                memcpy(b + i * stride, &v, sizeof(v));
            break;
        }
        case 2:
        {
            uint32_t v;
            a |= a << 8;
            a |= a << 16;
            v = a;
            for (i = 0; i != 4; ++i)
                memcpy(b + i * stride, &v, sizeof(v));
            break;
        }
        case 3:
        {
            uint64_t d;
            a |= a << 8;
            a |= a << 16;
            d = ((uint64_t)a << 32) | a;
            for (i = 0; i != 8; ++i)
                memcpy(b + i * stride, &d, sizeof(d));
            break;
        }
        case 4:
        {
            uint64_t d;
            a |= a << 8;
            a |= a << 16;
            d = ((uint64_t)a << 32) | a;
            for (i = 0; i != 16; ++i)
            {
                // 16 bytes per row = two 8-byte words
                memcpy(b + i * stride, &d, sizeof(d));
                memcpy(b + i * stride + 8, &d, sizeof(d));
            }
            break;
        }
    }
}
break; ++ } ++} ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) ++{ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++} ++ ++// Unsigned Trivial MOD ++static inline unsigned int utmod(const unsigned int x, const unsigned int n) ++{ ++ return x >= n ? 
// Obtain a job for the local context lc.
//
// Search order (all under jbg->lock):
//   1. the single job cached in jbc->jb1
//   2. the head of the global free chain jbg->free1
// If neither yields a job, lc is linked onto both the global wait queue
// (jw_* links, head/tail in jbg) and the per-jbc wait queue (ljw_* links,
// head/tail in jbc); the lock is then dropped and the caller blocks on
// lc->jw_sem until a job has been stored in lc->jw_job.
//
// The global wait queue keeps "good" waiters (lc->last_progress_good set)
// ahead of the others; jbg->wait_good tracks the last good waiter.
//
// Returns the job obtained; may block.
static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
{
    HEVCRpiJob * jb;
    HEVCRpiJobGlobal * const jbg = jbc->jbg;

    pthread_mutex_lock(&jbg->lock);
    // Check local 1st
    if ((jb = jbc->jb1) != NULL)
    {
        // Only 1 - very easy :-)
        jbc->jb1 = NULL;
    }
    else
    {
        // Now look for global free chain
        if ((jb = jbg->free1) != NULL)
        {
            // Found one - unlink it
            jbg->free1 = jb->next;
            jb->next = NULL;
        }
        else
        {
            // Out of places to look - wait for one to become free - add to Qs

            // Global
            // If "good" lc then add after the last "good" el in the chain
            // otherwise add to the tail
            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
            {
                // Add to end as we had to wait last time or wait Q empty
                // (also taken when the tail itself is good, as appending
                // then keeps the good els contiguous at the front)
                if ((lc->jw_prev = jbg->wait_tail) == NULL)
                    jbg->wait_head = lc;
                else
                    lc->jw_prev->jw_next = lc;
                lc->jw_next = NULL;
                jbg->wait_tail = lc;
            }
            else
            {
                // This is a "good" lc that we need to poke into the middle
                // of the Q
                // We know that the Q isn't empty and there is at least one
                // !last_progress_good el in it from the previous test

                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after

                if (p == NULL)
                {
                    // No current good els - add to head
                    lc->jw_next = jbg->wait_head;
                    jbg->wait_head = lc;
                }
                else
                {
                    lc->jw_next = p->jw_next;
                    p->jw_next = lc;
                }

                // The tail is not good here, so lc is never inserted last
                // and lc->jw_next is non-NULL
                lc->jw_next->jw_prev = lc;
                lc->jw_prev = p;
            }

            // If "good" then we are now the last good waiting el
            if (lc->last_progress_good)
                jbg->wait_good = lc;

            // Local
            // Per-jbc wait queue is plain FIFO: always append at the tail
            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
                jbc->lcw_head = lc;
            else
                lc->ljw_prev->ljw_next = lc;
            lc->ljw_next = NULL;
            jbc->lcw_tail = lc;
        }
    }

    pthread_mutex_unlock(&jbg->lock);

    if (jb == NULL) // Need to wait
    {
        rpi_sem_wait(&lc->jw_sem);
        jb = lc->jw_job;  // Set by free code
    }

    return jb;
}
++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ else ++ { ++ // ? seems somehow mildy ugly... ++ jbc = lc->context->jbc; ++ } ++ } ++ ++ if (lc != NULL) ++ { ++ // Something was waiting ++ ++ // Unlink ++ // Global ++ if (lc->jw_next == NULL) ++ jbg->wait_tail = lc->jw_prev; ++ else ++ lc->jw_next->jw_prev = lc->jw_prev; ++ ++ if (lc->jw_prev == NULL) ++ jbg->wait_head = lc->jw_next; ++ else ++ lc->jw_prev->jw_next = lc->jw_next; ++ ++ // Local ++ if (lc->ljw_next == NULL) ++ jbc->lcw_tail = lc->ljw_prev; ++ else ++ lc->ljw_next->ljw_prev = lc->ljw_prev; ++ ++ if (lc->ljw_prev == NULL) ++ jbc->lcw_head = lc->ljw_next; ++ else ++ lc->ljw_prev->ljw_next = lc->ljw_next; ++ ++ // Update good if required ++ if (jbg->wait_good == lc) ++ jbg->wait_good = lc->jw_prev; ++ ++ // Prod ++ lc->jw_job = jb; ++ sem_post(&lc->jw_sem); ++ } ++ ++ pthread_mutex_unlock(&jbg->lock); ++} ++ ++static void job_lc_kill(HEVCRpiLocalContext * const lc) ++{ ++ sem_destroy(&lc->jw_sem); ++} ++ ++static void job_lc_init(HEVCRpiLocalContext * const lc) ++{ ++ lc->jw_next = NULL; ++ lc->jw_prev = NULL; ++ lc->ljw_next = NULL; ++ lc->ljw_prev = NULL; ++ lc->jw_job = NULL; ++ sem_init(&lc->jw_sem, 0, 0); ++} ++ ++// Returns: ++// 0 if we have waited for MV or expect to wait for recon ++// 1 if we haven't waited for MV & do not need to wait for recon ++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) ++{ ++ if (jb->waited) // reset by rpi_begin ++ return 0; ++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) ++ { ++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && ++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) ++ return 0; ++ } ++ return 1; ++} ++ ++// Submit job if it is full (indicated by having ctu_ts_last set >= 0) ++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl *const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ 
++ av_assert1(jb != NULL); ++ ++ if (jb->ctu_ts_last < 0) { ++ return; ++ } ++ ++ lc->last_progress_good = progress_good(s, jb); ++ jb->waited = !lc->last_progress_good; ++ lc->jb0 = NULL; ++ ++ if (s->offload_recon) ++ { ++ pthread_mutex_lock(&jbc->in_lock); ++ jbc->offloadq[jbc->offload_in] = jb; ++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); ++ pthread_mutex_unlock(&jbc->in_lock); ++ ++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually ++ } ++ else ++ { ++ pass_queue_do_all(s, jb); // Consumes job before return ++ } ++} ++ ++ ++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes ++// available to receive the next job. ++// ++// Now safe against multiple callers - needed for tiles ++// "normal" and WPP will only call here one at a time ++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ ++ // It is legit for us to already have a job allocated - do nothing in this case ++ if (lc->jb0 != NULL) ++ return; ++ ++ if (s->offload_recon) ++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much ++ ++ lc->jb0 = job_alloc(jbc, lc); ++ ++ rpi_begin(s, lc->jb0, lc->ts); ++} ++ ++// Free up a job without submission ++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ if (jb == NULL) { ++ return; ++ } ++ ++ lc->jb0 = NULL; ++ ++ job_free(jbc, jb); ++ ++ // If offload then poke sem_out too ++ if (s->offload_recon) { ++ sem_post(&jbc->sem_out); ++ } ++} ++ ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++// Slightly icky as there is no clean way to wait for a sem to count up ++// Not reentrant - call on main thread only ++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ int i 
= 0; ++ ++ // We shouldn't reach here with an unsubmitted job ++ av_assert1(lc->jb0 == NULL); ++ ++ // If no offload then there can't be anything to wait for ++ if (!s->offload_recon) { ++ return; ++ } ++ ++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) ++ { ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ rpi_sem_wait(&jbc->sem_out); ++ } ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ sem_post(&jbc->sem_out); ++ } ++ } ++} ++ ++static void * pass_worker(void *arg) ++{ ++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; ++ HEVCRpiContext *const s = pq->context; ++ ++ for (;;) ++ { ++ rpi_sem_wait(&pq->sem_in); ++ ++ if (pq->terminate) ++ break; ++ ++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); ++ // * should really set jb->passes_done here ++ ++ sem_post(pq->psem_out); ++ } ++ return NULL; ++} ++ ++static void pass_queues_start_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); ++ pqs[i].started = 1; ++ } ++} ++ ++static void pass_queues_term_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pqs[i].terminate = 1; ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) ++ sem_post(&pqs[i].sem_in); ++ } ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) { ++ pthread_join(pqs[i].thread, NULL); ++ pqs[i].started = 0; ++ } ++ } ++} ++ ++static void pass_queues_kill_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pass_queue_kill(pqs + i); ++} ++ ++ ++static void worker_pic_free_one(HEVCRpiJob * const jb) ++{ ++ // Free coeff stuff - allocation not the same for all buffers ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if 
(cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++} ++ ++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) ++{ ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero zapping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ return 0; ++ ++fail: ++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); ++ worker_pic_free_one(jb); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++#if RPI_COMPRESS_COEFFS ++ cf->s[i].packed = 1; ++ cf->s[i].packed_n = 0; ++#endif ++ } ++} ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) ++{ ++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? 
// Publish a new progress value for one field of the current frame and
// release every waiter whose request is now satisfied.
//
// s     - context whose s->ref frame is being decoded
// val   - new progress value to store
// field - index into both the frame's progress array and s->progress_states
void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
{
    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;

    // The value is stored before the lock is taken; waiters re-check it
    // under the same lock before queueing themselves
    ((int *)s->ref->tf.progress->data)[field] = val;

    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
    {
        // ppwait always points at the link that leads to pwait, so an
        // element can be unlinked without tracking a separate "prev"
        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
        HEVCRpiFrameProgressWait * pwait;

        while ((pwait = *ppwait) != NULL) {
            if (pwait->req > val)
            {
                // Still not satisfied - keep it; the final kept element
                // ends up recorded as the list tail
                ppwait = &pwait->next;
                pstate->last = pwait;
            }
            else
            {
                // Satisfied - unlink, mark idle (req = -1) and wake it
                *ppwait = pwait->next;
                pwait->req = -1;
                pwait->next = NULL;
                sem_post(&pwait->sem);
            }
        }
        // NOTE: if nothing was kept, pstate->last is left unchanged while
        // pstate->first is NULL
    }
    pthread_mutex_unlock(&pstate->lock);
}
void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) ++{ ++ av_assert1(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++ ++ ++/** ++ * NOTE: Each function hls_foo correspond to the function foo in the ++ * specification (HLS stands for High Level Syntax). ++ */ ++ ++/** ++ * Section 5.7 ++ */ ++ ++// Realloc the entry point arrays ++static int alloc_entry_points(RpiSliceHeader * const sh, const int n) ++{ ++ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) ++ { ++ // Round up alloc to multiple of 32 ++ int a = (n + 31) & ~31; ++ ++ // We don't care about the previous contents so probably fastest to simply discard ++ av_freep(&sh->entry_point_offset); ++ av_freep(&sh->offset); ++ av_freep(&sh->size); ++ ++ if (a != 0) ++ { ++ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); ++ sh->offset = av_malloc_array(a, sizeof(int)); ++ sh->size = av_malloc_array(a, sizeof(int)); ++ ++ if (!sh->entry_point_offset || !sh->offset || !sh->size) { ++ sh->num_entry_point_offsets = 0; ++ sh->offsets_allocated = 0; ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ sh->offsets_allocated = a; ++ } ++ ++ return 0; ++} ++ ++/* free everything allocated by pic_arrays_init() */ ++static void pic_arrays_free(HEVCRpiContext *s) ++{ ++ av_freep(&s->sao); ++ av_freep(&s->deblock); ++ ++ av_freep(&s->cabac_stash_up); ++ s->cabac_stash_left = NULL; // freed with _up ++ ++ av_freep(&s->mvf_up); ++ av_freep(&s->mvf_left); ++ ++ av_freep(&s->is_pcm); ++ av_freep(&s->is_intra_store); ++ s->is_intra = NULL; ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ ++ av_freep(&s->qp_y_tab); ++ av_freep(&s->tab_slice_address); ++ 
av_freep(&s->filter_slice_edges); ++ ++ av_freep(&s->bs_horizontal); ++ s->bs_vertical = NULL; // freed with H ++ av_freep(&s->bsf_stash_left); ++ av_freep(&s->bsf_stash_up); ++ ++ av_freep(&s->rpl_up); ++ av_freep(&s->rpl_left); ++ ++ alloc_entry_points(&s->sh, 0); ++ ++ av_buffer_pool_uninit(&s->col_mvf_pool); ++} ++ ++/* allocate arrays that depend on frame dimensions */ ++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) ++{ ++ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; ++ const unsigned int width = sps->width; ++ const unsigned int height = sps->height; ++ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * ++ ((height >> log2_min_cb_size) + 1); ++ const unsigned int ctb_count = sps->ctb_size; ++ ++ { ++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); ++ unsigned int h = ((height + 15) & ~15); ++ ++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size ++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols ++ } ++ ++ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly ++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); ++ if (!s->sao || !s->deblock) ++ goto fail; ++ ++ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); ++ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); ++ if (s->cabac_stash_up == NULL) ++ goto fail; ++ ++ // Round width up to max ctb size ++ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ // * Only needed if we have H tiles ++ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ ++ // We can overread by 1 line & one byte in deblock so alloc & zero ++ // We don't need to zero the extra @ start of frame as it will never be ++ // written ++ s->is_pcm = av_mallocz(sps->pcm_width * 
(sps->pcm_height + 1) + 1); ++ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); ++ if (s->is_pcm == NULL || s->is_intra_store == NULL) ++ goto fail; ++ ++ s->filter_slice_edges = av_mallocz(ctb_count); ++ s->tab_slice_address = av_malloc_array(ctb_count, ++ sizeof(*s->tab_slice_address)); ++ s->qp_y_tab = av_malloc_array(pic_size_in_cb, ++ sizeof(*s->qp_y_tab)); ++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) ++ goto fail; ++ ++ s->bs_horizontal = av_mallocz(s->bs_size * 2); ++ s->bs_vertical = s->bs_horizontal + s->bs_size; ++ if (s->bs_horizontal == NULL) ++ goto fail; ++ ++ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); ++ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); ++ if (s->rpl_left == NULL || s->rpl_up == NULL) ++ goto fail; ++ ++ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || ++ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) ++ goto fail; ++ ++ s->col_mvf_stride = (width + 15) >> 4; ++ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), ++ av_buffer_allocz); ++ if (s->col_mvf_pool == NULL) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ return AVERROR(ENOMEM); ++} ++ ++static void default_pred_weight_table(HEVCRpiContext * const s) ++{ ++ unsigned int i; ++ const unsigned int wt = 1 << QPU_MC_DENOM; ++ s->sh.luma_log2_weight_denom = 0; ++ s->sh.chroma_log2_weight_denom = 0; ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ s->sh.luma_weight_l0[i] = wt; ++ s->sh.luma_offset_l0[i] = 0; ++ s->sh.chroma_weight_l0[i][0] = wt; ++ s->sh.chroma_weight_l0[i][1] = wt; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ s->sh.luma_weight_l1[i] = wt; ++ s->sh.luma_offset_l1[i] = 0; ++ s->sh.chroma_weight_l1[i][0] = wt; ++ s->sh.chroma_weight_l1[i][1] = wt; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ 
s->sh.chroma_offset_l1[i][1] = 0; ++ } ++} ++ ++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, ++ const unsigned int refs, ++ int16_t * luma_weight, int16_t * luma_offset, ++ int16_t * chroma_weight, int16_t * chroma_offset) ++{ ++ unsigned int luma_flags; ++ unsigned int chroma_flags; ++ unsigned int i; ++ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); ++ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; ++ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); ++ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); ++ ++ if (refs == 0) ++ return 0; ++ ++ luma_flags = get_bits(gb, refs); ++ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs); ++ i = 1 << (refs - 1); ++ ++ do ++ { ++ if ((luma_flags & i) != 0) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int offset = get_se_golomb(gb); ++ if (delta_weight < -128 || delta_weight > 127 || ++ offset < -wp_offset_half_range || offset >= wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); ++ *luma_offset++ = offset << wp_offset_bd_shift; ++ } ++ else ++ { ++ *luma_weight++ = luma_weight_base; ++ *luma_offset++ = 0; ++ } ++ ++ if ((chroma_flags & i) != 0) ++ { ++ unsigned int j; ++ for (j = 0; j != 2; ++j) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int delta_offset = get_se_golomb(gb); ++ ++ if (delta_weight < -128 || delta_weight > 127 || ++ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ ++ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); ++ *chroma_offset++ = av_clip( ++ 
wp_offset_half_range + delta_offset - ++ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), ++ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; ++ } ++ } ++ else ++ { ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_offset++ = 0; ++ *chroma_offset++ = 0; ++ } ++ } while ((i >>= 1) != 0); ++ ++ return 0; ++} ++ ++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++{ ++ int err; ++ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); ++ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); ++ ++ if (luma_log2_weight_denom > 7 || ++ chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", ++ luma_log2_weight_denom, chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; ++ ++ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], ++ s->sh.luma_weight_l0, s->sh.luma_offset_l0, ++ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || ++ (err = get_weights(s, gb, s->sh.nb_refs[L1], ++ s->sh.luma_weight_l1, s->sh.luma_offset_l1, ++ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) ++{ ++ const HEVCRpiSPS *sps = s->ps.sps; ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_delta_msb = 0; ++ unsigned int nb_sps = 0, nb_sh; ++ int i; ++ ++ rps->nb_refs = 0; ++ if (!sps->long_term_ref_pics_present_flag) ++ return 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 0) ++ nb_sps = get_ue_golomb_long(gb); ++ nb_sh = 
get_ue_golomb_long(gb); ++ ++ if (nb_sps > sps->num_long_term_ref_pics_sps) ++ return AVERROR_INVALIDDATA; ++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) ++ return AVERROR_INVALIDDATA; ++ ++ rps->nb_refs = nb_sh + nb_sps; ++ ++ for (i = 0; i < rps->nb_refs; i++) { ++ uint8_t delta_poc_msb_present; ++ ++ if (i < nb_sps) { ++ uint8_t lt_idx_sps = 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 1) ++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); ++ ++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; ++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; ++ } else { ++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ rps->used[i] = get_bits1(gb); ++ } ++ ++ delta_poc_msb_present = get_bits1(gb); ++ if (delta_poc_msb_present) { ++ int64_t delta = get_ue_golomb_long(gb); ++ int64_t poc; ++ ++ if (i && i != nb_sps) ++ delta += prev_delta_msb; ++ ++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; ++ if (poc != (int32_t)poc) ++ return AVERROR_INVALIDDATA; ++ rps->poc[i] = poc; ++ prev_delta_msb = delta; ++ } ++ } ++ ++ return 0; ++} ++ ++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, ++ const HEVCRpiSPS *sps) ++{ ++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; ++ const HEVCRpiWindow *ow = &sps->output_window; ++ unsigned int num = 0, den = 0; ++ ++ avctx->pix_fmt = sps->pix_fmt; ++ avctx->coded_width = sps->width; ++ avctx->coded_height = sps->height; ++ avctx->width = sps->width - ow->left_offset - ow->right_offset; ++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; ++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; ++ avctx->profile = sps->ptl.general_ptl.profile_idc; ++ avctx->level = sps->ptl.general_ptl.level_idc; ++ ++ ff_set_sar(avctx, sps->vui.sar); ++ ++ if (sps->vui.video_signal_type_present_flag) ++ avctx->color_range = sps->vui.video_full_range_flag ? 
AVCOL_RANGE_JPEG ++ : AVCOL_RANGE_MPEG; ++ else ++ avctx->color_range = AVCOL_RANGE_MPEG; ++ ++ if (sps->vui.colour_description_present_flag) { ++ avctx->color_primaries = sps->vui.colour_primaries; ++ avctx->color_trc = sps->vui.transfer_characteristic; ++ avctx->colorspace = sps->vui.matrix_coeffs; ++ } else { ++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; ++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; ++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; ++ } ++ ++ if (vps->vps_timing_info_present_flag) { ++ num = vps->vps_num_units_in_tick; ++ den = vps->vps_time_scale; ++ } else if (sps->vui.vui_timing_info_present_flag) { ++ num = sps->vui.vui_num_units_in_tick; ++ den = sps->vui.vui_time_scale; ++ } ++ ++ if (num != 0 && den != 0) ++ av_reduce(&avctx->framerate.den, &avctx->framerate.num, ++ num, den, 1 << 30); ++} ++ ++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) ++{ ++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; ++ ++ // Admit to no h/w formats ++ ++ *fmt++ = sps->pix_fmt; ++ *fmt = AV_PIX_FMT_NONE; ++ ++ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); ++} ++ ++static int is_sps_supported(const HEVCRpiSPS * const sps) ++{ ++ return av_rpi_is_sand_format(sps->pix_fmt) && ++ sps->width <= HEVC_RPI_MAX_WIDTH && ++ sps->height <= HEVC_RPI_MAX_HEIGHT; ++} ++ ++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, ++ const enum AVPixelFormat pix_fmt) ++{ ++ int ret; ++ ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ s->ps.vps = NULL; ++ ++ if (sps == NULL) ++ return 0; ++ ++ if (!is_sps_supported(sps)) ++ return AVERROR_DECODER_NOT_FOUND; ++ ++ ret = pic_arrays_init(s, sps); ++ if (ret < 0) ++ goto fail; ++ ++ export_stream_params(s->avctx, &s->ps, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); ++ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); ++ ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ ++ if (sps->sao_enabled) ++ { ++ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; ++ ++ for(c_idx = 0; c_idx < c_count; c_idx++) { ++ int w = sps->width >> ctx_hshift(s, c_idx); ++ int h = sps->height >> ctx_vshift(s, c_idx); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! 
++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; ++ } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; ++ } ++ ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ return ret; ++} ++ ++static inline int qp_offset_valid(const int qp_offset) ++{ ++ return qp_offset >= -12 && qp_offset <= 12; ++} ++ ++static int hls_slice_header(HEVCRpiContext * const s) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ RpiSliceHeader * const sh = &s->sh; ++ int i, ret; ++ ++ // Coded parameters ++ sh->first_slice_in_pic_flag = get_bits1(gb); ++ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ if (IS_IDR(s)) ++ ff_hevc_rpi_clear_refs(s); ++ } ++ sh->no_output_of_prior_pics_flag = 0; ++ if (IS_IRAP(s)) ++ sh->no_output_of_prior_pics_flag = get_bits1(gb); ++ ++ sh->pps_id = get_ue_golomb_long(gb); ++ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ if (!sh->first_slice_in_pic_flag && ++ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ s->ps.pps = 
(HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) ++ sh->no_output_of_prior_pics_flag = 1; ++ ++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { ++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; ++ const HEVCRpiSPS *last_sps = s->ps.sps; ++ enum AVPixelFormat pix_fmt; ++ ++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { ++ if (sps->width != last_sps->width || sps->height != last_sps->height || ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != ++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) ++ sh->no_output_of_prior_pics_flag = 0; ++ } ++ ff_hevc_rpi_clear_refs(s); ++ ++ ret = set_sps(s, sps, sps->pix_fmt); ++ if (ret < 0) ++ return ret; ++ ++ pix_fmt = get_format(s, sps); ++ if (pix_fmt < 0) ++ return pix_fmt; ++ ++// ret = set_sps(s, sps, pix_fmt); ++// if (ret < 0) ++// return ret; ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ sh->dependent_slice_segment_flag = 0; ++ if (!sh->first_slice_in_pic_flag) { ++ int slice_address_length; ++ ++ if (s->ps.pps->dependent_slice_segments_enabled_flag) ++ sh->dependent_slice_segment_flag = get_bits1(gb); ++ ++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); ++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); ++ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid slice segment address: %u.\n", ++ sh->slice_segment_addr); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ sh->slice_addr = sh->slice_segment_addr; ++ s->slice_idx++; ++ } ++ } else { ++ sh->slice_segment_addr = sh->slice_addr = 0; ++ s->slice_idx = 0; ++ s->slice_initialized = 0; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ s->slice_initialized = 0; ++ ++ for (i = 0; i < 
s->ps.pps->num_extra_slice_header_bits; i++) ++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] ++ ++ sh->slice_type = get_ue_golomb_long(gb); ++ if (!(sh->slice_type == HEVC_SLICE_I || ++ sh->slice_type == HEVC_SLICE_P || ++ sh->slice_type == HEVC_SLICE_B)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", ++ sh->slice_type); ++ return AVERROR_INVALIDDATA; ++ } ++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { ++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // when flag is not present, picture is inferred to be output ++ sh->pic_output_flag = 1; ++ if (s->ps.pps->output_flag_present_flag) ++ sh->pic_output_flag = get_bits1(gb); ++ ++ if (s->ps.sps->separate_colour_plane_flag) ++ sh->colour_plane_id = get_bits(gb, 2); ++ ++ if (!IS_IDR(s)) { ++ int poc, pos; ++ ++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); ++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); ++ if (!sh->first_slice_in_pic_flag && poc != s->poc) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ poc = s->poc; ++ } ++ s->poc = poc; ++ ++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); ++ pos = get_bits_left(gb); ++ if (!sh->short_term_ref_pic_set_sps_flag) { ++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); ++ if (ret < 0) ++ return ret; ++ ++ sh->short_term_rps = &sh->slice_rps; ++ } else { ++ int numbits, rps_idx; ++ ++ if (!s->ps.sps->nb_st_rps) { ++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); ++ rps_idx = numbits > 0 ? 
get_bits(gb, numbits) : 0; ++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; ++ } ++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ pos = get_bits_left(gb); ++ ret = decode_lt_rps(s, &sh->long_term_rps, gb); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ } ++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ if (s->ps.sps->sps_temporal_mvp_enabled_flag) ++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); ++ else ++ sh->slice_temporal_mvp_enabled_flag = 0; ++ } else { ++ s->sh.short_term_rps = NULL; ++ s->poc = 0; ++ } ++ ++ /* 8.3.1 */ ++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && ++ s->nal_unit_type != HEVC_NAL_TRAIL_N && ++ s->nal_unit_type != HEVC_NAL_TSA_N && ++ s->nal_unit_type != HEVC_NAL_STSA_N && ++ s->nal_unit_type != HEVC_NAL_RADL_N && ++ s->nal_unit_type != HEVC_NAL_RADL_R && ++ s->nal_unit_type != HEVC_NAL_RASL_N && ++ s->nal_unit_type != HEVC_NAL_RASL_R) ++ s->pocTid0 = s->poc; ++ ++ if (s->ps.sps->sao_enabled) { ++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); ++ if (ctx_cfmt(s) != 0) { ++ sh->slice_sample_adaptive_offset_flag[1] = ++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); ++ } ++ } else { ++ sh->slice_sample_adaptive_offset_flag[0] = 0; ++ sh->slice_sample_adaptive_offset_flag[1] = 0; ++ sh->slice_sample_adaptive_offset_flag[2] = 0; ++ } ++ ++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; ++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { ++ int nb_refs; ++ ++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; ++ ++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag ++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; ++ } ++ if 
(sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { ++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", ++ sh->nb_refs[L0], sh->nb_refs[L1]); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->rpl_modification_flag[0] = 0; ++ sh->rpl_modification_flag[1] = 0; ++ nb_refs = ff_hevc_rpi_frame_nb_refs(s); ++ if (!nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { ++ sh->rpl_modification_flag[0] = get_bits1(gb); ++ if (sh->rpl_modification_flag[0]) { ++ for (i = 0; i < sh->nb_refs[L0]; i++) ++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ sh->rpl_modification_flag[1] = get_bits1(gb); ++ if (sh->rpl_modification_flag[1] == 1) ++ for (i = 0; i < sh->nb_refs[L1]; i++) ++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->mvd_l1_zero_flag = get_bits1(gb); ++ ++ if (s->ps.pps->cabac_init_present_flag) ++ sh->cabac_init_flag = get_bits1(gb); ++ else ++ sh->cabac_init_flag = 0; ++ ++ sh->collocated_ref_idx = 0; ++ if (sh->slice_temporal_mvp_enabled_flag) { ++ sh->collocated_list = L0; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->collocated_list = !get_bits1(gb); ++ ++ if (sh->nb_refs[sh->collocated_list] > 1) { ++ sh->collocated_ref_idx = get_ue_golomb_long(gb); ++ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid collocated_ref_idx: %d.\n", ++ sh->collocated_ref_idx); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ } ++ ++ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) ++ { ++ if ((ret = pred_weight_table(s, gb)) != 0) ++ return ret; ++ } ++ else ++ { ++ // Give us unit weights ++ default_pred_weight_table(s); ++ } 
++ ++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); ++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid number of merging MVP candidates: %d.\n", ++ sh->max_num_merge_cand); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ sh->slice_qp_delta = get_se_golomb(gb); ++ ++ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { ++ sh->slice_cb_qp_offset = get_se_golomb(gb); ++ sh->slice_cr_qp_offset = get_se_golomb(gb); ++ if (!qp_offset_valid(sh->slice_cb_qp_offset) || ++ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || ++ !qp_offset_valid(sh->slice_cr_qp_offset) || ++ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) ++ { /* log args follow the format labels: PPS cb/cr first, then slice cb/cr */ ++ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n", ++ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset, ++ sh->slice_cb_qp_offset, sh->slice_cr_qp_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ } else ++ { ++ sh->slice_cb_qp_offset = 0; ++ sh->slice_cr_qp_offset = 0; ++ } ++ ++ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) ++ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); ++ else ++ sh->cu_chroma_qp_offset_enabled_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_control_present_flag) { ++ int deblocking_filter_override_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_override_enabled_flag) ++ deblocking_filter_override_flag = get_bits1(gb); ++ ++ if (deblocking_filter_override_flag) { ++ sh->disable_deblocking_filter_flag = get_bits1(gb); ++ if (!sh->disable_deblocking_filter_flag) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || ++ tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid deblock filter offsets: %d, %d\n", ++ beta_offset_div2, tc_offset_div2); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->beta_offset = beta_offset_div2 * 2; ++ sh->tc_offset = 
tc_offset_div2 * 2; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; ++ sh->beta_offset = s->ps.pps->beta_offset; ++ sh->tc_offset = s->ps.pps->tc_offset; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = 0; ++ sh->beta_offset = 0; ++ sh->tc_offset = 0; ++ } ++ ++ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && ++ (sh->slice_sample_adaptive_offset_flag[0] || ++ sh->slice_sample_adaptive_offset_flag[1] || ++ !sh->disable_deblocking_filter_flag)) { ++ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ } else { ++ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; ++ } ++ sh->no_dblk_boundary_flags = ++ (sh->slice_loop_filter_across_slices_enabled_flag ? 0 : ++ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | ++ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : ++ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); ++ ++ ++ } else if (!s->slice_initialized) { ++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = 0; ++ sh->offload_wpp = 0; ++ sh->offload_tiles = 0; ++ ++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { ++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); ++ // It would be possible to bound this tighter but this here is simpler ++ if (num_entry_point_offsets > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = num_entry_point_offsets; ++ if (sh->num_entry_point_offsets > 0) { ++ int offset_len = get_ue_golomb_long(gb) + 1; ++ ++ if (offset_len < 1 || offset_len > 32) { ++ sh->num_entry_point_offsets = 0; ++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if ((ret = alloc_entry_points(sh, 
sh->num_entry_point_offsets)) < 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < sh->num_entry_point_offsets; i++) { ++ uint32_t val_minus1 = get_bits_long(gb, offset_len); ++ if (val_minus1 > (1 << 28)) ++ { ++ // We can declare offsets of > 2^28 bad without loss of generality ++ // Will check actual bounds wrt NAL later, but this keeps ++ // the values within bounds we can deal with easily ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size ++ } ++ ++ // Do we want to offload this ++ if (s->threads_type != 0) ++ { ++ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && ++ s->ps.pps->num_tile_columns > 1; ++ // * We only cope with WPP in a single column ++ // Probably want to deal with that case as tiles rather than WPP anyway ++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly ++ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->num_tile_columns == 1; ++ } ++ } ++ } ++ ++ if (s->ps.pps->slice_header_extension_present_flag) { ++ unsigned int length = get_ue_golomb_long(gb); ++ if (length*8LL > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < length; i++) ++ skip_bits(gb, 8); // slice_header_extension_data_byte ++ } ++ ++ // Inferred parameters ++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; ++ if (sh->slice_qp > 51 || ++ sh->slice_qp < -s->ps.sps->qp_bd_offset) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The slice_qp %d is outside the valid range " ++ "[%d, 51].\n", ++ sh->slice_qp, ++ -s->ps.sps->qp_bd_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Overread 
slice header by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->slice_initialized = 1; ++ return 0; ++} ++ ++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) ++{ ++ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; ++ int c_idx, i; ++ ++ if (s->sh.slice_sample_adaptive_offset_flag[0] || ++ s->sh.slice_sample_adaptive_offset_flag[1]) { ++ if ((lc->ctb_avail & AVAIL_L) != 0) ++ { ++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_left_flag) { ++ *sao = sao[-1]; ++ return; ++ } ++ } ++ if ((lc->ctb_avail & AVAIL_U) != 0) ++ { ++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_up_flag) { ++ *sao = sao[-(int)s->ps.sps->ctb_width]; ++ return; ++ } ++ } ++ } ++ ++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { ++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma : ++ s->ps.pps->log2_sao_offset_scale_chroma; ++ int offset_abs[4]; ++ char offset_sign[4] = {0}; ++ ++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { ++ sao->type_idx[c_idx] = SAO_NOT_APPLIED; ++ continue; ++ } ++ ++ if (c_idx == 2) { ++ sao->type_idx[2] = sao->type_idx[1]; ++ sao->eo_class[2] = sao->eo_class[1]; ++ } else { ++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); ++ } ++ ++ // ** Could use BY22 here quite plausibly - this is all bypass stuff ++ // though only per CTB so not very timing critical ++ ++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) ++ continue; ++ ++ for (i = 0; i < 4; i++) ++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); ++ ++ if (sao->type_idx[c_idx] == SAO_BAND) { ++ for (i = 0; i < 4; i++) { ++ if (offset_abs[i] != 0) ++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); ++ } ++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); ++ } else if (c_idx != 2) { ++ sao->eo_class[c_idx] = 
ff_hevc_rpi_sao_eo_class_decode(lc); ++ } ++ ++ // Inferred parameters ++ sao->offset_val[c_idx][0] = 0; ++ for (i = 0; i < 4; i++) { ++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; ++ if (sao->type_idx[c_idx] == SAO_EDGE) { ++ if (i > 1) ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } else if (offset_sign[i]) { ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } ++ } ++ } ++} ++ ++#if 0 ++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { ++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 ++ ++ if (log2_res_scale_abs_plus1 != 0) { ++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); ++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * ++ (1 - 2 * res_scale_sign_flag); ++ } else { ++ lc->tu.res_scale_val = 0; ++ } ++ ++ ++ return 0; ++} ++#endif ++ ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) ++{ ++ return jb->intra.cmds + jb->intra.n++; ++} ++ ++#define A0(x, y, U, L, UL, UR, DL) \ ++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) ++ ++#define A1(x, y, U, L, UL, UR, DL) \ ++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) ++ ++#define A2(x, y, U, L, UL, UR, DL) \ ++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) ++ ++#define A3(x, y, U, L, UL, UR, DL) \ ++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) ++ ++#define A4(x, y, U, L, UL, UR, DL) \ ++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) ++ ++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) ++{ ++ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; ++ const unsigned int tb_x = x & ~ctb_mask; ++ const unsigned int tb_y = y & ~ctb_mask; ++ const unsigned int ctb_avail = lc->ctb_avail; ++ ++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; ++ ++ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); ++ ++ // This deals with both the U & L edges ++ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) ++ f |= AVAIL_UL; ++ ++ if (x + w < lc->end_of_ctb_x) ++ f |= (tb_y == 0 ? 
ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; ++ else if (tb_y == 0) ++ f |= (ctb_avail & AVAIL_UR); ++#if AVAIL_S_U - AVAIL_S_UR < 0 ++#error Shift problem ++#endif ++ ++ // Never any D if Y beyond eoctb ++ if (y + h < lc->end_of_ctb_y) ++ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; ++#if AVAIL_S_DL - AVAIL_S_L < 0 ++#error Shift problem ++#endif ++ ++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, ++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], ++// lc->end_of_ctb_x, lc->end_of_ctb_y); ++ ++ return f; ++} ++ ++#undef A0 ++#undef A1 ++#undef A2 ++#undef A3 ++#undef A4 ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, ++ unsigned int avail) ++{ ++ // If rpi_enabled then sand - U & V done on U call ++ if (c_idx <= 1) ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_INTRA + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->avail = avail; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); ++ } ++} ++ ++#define CBF_CB0_S 0 ++#define CBF_CB1_S 1 // CB1 must be CB0 + 1 ++#define CBF_CR0_S 2 ++#define CBF_CR1_S 3 ++ ++#define CBF_CB0 (1 << CBF_CB0_S) ++#define CBF_CR0 (1 << CBF_CR0_S) ++#define CBF_CB1 (1 << CBF_CB1_S) ++#define CBF_CR1 (1 << CBF_CR1_S) ++ ++// * Only good for chroma_idx == 1 ++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, ++ const unsigned int blk_idx, const int cbf_luma, ++ const unsigned int cbf_chroma) ++{ ++ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); ++ const unsigned int x0_c = x0 & ~7; ++ const unsigned int y0_c = y0 & ~7; ++ ++ enum ScanType scan_idx = SCAN_DIAG; ++ enum ScanType scan_idx_c = SCAN_DIAG; ++ ++ if (lc->cu.pred_mode == MODE_INTRA) ++ { ++ const unsigned int trafo_size = 1 << log2_trafo_size; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); ++ ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); ++ ++ if (log2_trafo_size > 2) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); ++ else if (blk_idx == 3) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); ++ ++ if (log2_trafo_size < 4) { ++ if (lc->tu.intra_pred_mode >= 6 && ++ lc->tu.intra_pred_mode <= 14) { ++ scan_idx = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode >= 22 && ++ lc->tu.intra_pred_mode <= 30) { ++ scan_idx = SCAN_HORIZ; ++ } ++ ++ if (lc->tu.intra_pred_mode_c >= 6 && ++ lc->tu.intra_pred_mode_c <= 14) { ++ scan_idx_c = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode_c >= 22 && ++ lc->tu.intra_pred_mode_c <= 30) { ++ scan_idx_c = SCAN_HORIZ; ++ } ++ } ++ } ++ ++ if (!cbf_luma && 
cbf_chroma == 0) ++ return 0; ++ ++ if (lc->tu.is_cu_qp_delta_wanted) ++ { ++ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); ++ const unsigned int cb_mask = ~0U << log2_cb_size; ++ ++ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || ++ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The cu_qp_delta %d is outside the valid range " ++ "[%d, %d].\n", ++ qp_delta, ++ -(26 + (s->ps.sps->qp_bd_offset >> 1)), ++ (25 + (s->ps.sps->qp_bd_offset >> 1))); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_qp_delta = qp_delta; ++ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); ++ } ++ ++ // * Not main profile & untested due to no conform streams ++ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && ++ !lc->cu.cu_transquant_bypass_flag) { ++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); ++ if (cu_chroma_qp_offset_flag) { ++ int cu_chroma_qp_offset_idx = 0; ++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { ++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); ++ } ++ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; ++ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; ++ } ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ } ++ ++ if (cbf_luma) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ ++ if (log2_trafo_size > 2 || blk_idx == 3) ++ { ++ if ((cbf_chroma & CBF_CB0) != 0) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 1); ++ if ((cbf_chroma & CBF_CR0) != 0) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 2); ++ } ++ ++ return 0; ++} ++ ++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) ++{ ++ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, 
log2_cb_size - 3); ++} ++ ++ ++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, ++ const unsigned int trafo_depth, const unsigned int blk_idx, ++ const unsigned int cbf_c0) ++{ ++ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 ++ unsigned int cbf_c1 = cbf_c0; ++ int split_transform_flag; ++ int ret; ++ ++ if (lc->cu.intra_split_flag) { ++ if (trafo_depth == 1) { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; ++ if (ctx_cfmt(s) == 3) { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; ++ } else { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ } ++ } else { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ ++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && ++ log2_trafo_size > s->ps.sps->log2_min_tb_size && ++ trafo_depth < lc->cu.max_trafo_depth && ++ !(lc->cu.intra_split_flag && trafo_depth == 0)) ++ { ++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); ++ } else { ++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && ++ lc->cu.pred_mode == MODE_INTER && ++ lc->cu.part_mode != PART_2Nx2N && ++ trafo_depth == 0; ++ ++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || ++ (lc->cu.intra_split_flag && trafo_depth == 0) || ++ inter_split; ++ } ++ ++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) ++ { ++ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); ++ cbf_c1 = 0; ++ ++ if ((cbf_c0 & CBF_CB0) != 0) ++ { ++ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; ++ if (wants_c1) ++ cbf_c1 |= 
ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; ++ } ++ ++ if ((cbf_c0 & CBF_CR0) != 0) ++ { ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; ++ if (wants_c1) ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; ++ } ++ } ++ ++ if (split_transform_flag) { ++ const int trafo_size_split = 1 << (log2_trafo_size - 1); ++ const int x1 = x0 + trafo_size_split; ++ const int y1 = y0 + trafo_size_split; ++ ++#define SUBDIVIDE(x, y, idx) \ ++do { \ ++ ret = hls_transform_tree(s, lc, x, y, \ ++ log2_trafo_size - 1, trafo_depth + 1, idx, \ ++ cbf_c1); \ ++ if (ret < 0) \ ++ return ret; \ ++} while (0) ++ ++ SUBDIVIDE(x0, y0, 0); ++ SUBDIVIDE(x1, y0, 1); ++ SUBDIVIDE(x0, y1, 2); ++ SUBDIVIDE(x1, y1, 3); ++ ++#undef SUBDIVIDE ++ } else { ++ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have ++ // trafo_size == 2 with depth == 0 the issue is moot ++ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || ++ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); ++ ++ ret = hls_transform_unit(s, lc, x0, y0, ++ log2_trafo_size + trafo_depth, log2_trafo_size, ++ blk_idx, cbf_luma, cbf_c1); ++ if (ret < 0) ++ return ret; ++ ++ if (!s->sh.disable_deblocking_filter_flag) { ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); ++ } ++ } ++ return 0; ++} ++ ++ ++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) ++{ ++ GetBitContext gb; ++ int ret; ++ ++ ret = init_get_bits(&gb, pcm, length); ++ if (ret < 0) ++ return ret; ++ ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), ++ frame_stride1(s->frame, 0), ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), ++ s->frame->linesize[1], ++ cb_size >> ctx_hshift(s, 1), ++ cb_size >> 
ctx_vshift(s, 1), ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ ++ return 0; ++} ++ ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); ++ ++ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ ++ // Copy coeffs ++ { ++ const int blen = (length + 7) >> 3; ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. 
++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++} ++ ++ ++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, ++ const MvXY xy, const int y0, const int height) ++{ ++ if (s->threads_type != 0) { ++ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); ++ ++ // Progress has to be attached to current job as the actual wait ++ // is in worker_core which can't use lc ++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++} ++ ++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int nPbW, ++ const int nPbH, ++ HEVCRpiMvField * const mv) ++{ ++ enum InterPredIdc inter_pred_idc = PRED_L0; ++ int mvp_flag; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); ++ ++ mv->pred_flag = 0; ++ if (s->sh.slice_type == HEVC_SLICE_B) ++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); ++ ++ if (inter_pred_idc != PRED_L1) { ++ MvXY mvd; ++ ++ if (s->sh.nb_refs[L0]) ++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); ++ ++ mv->pred_flag = PF_L0; ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 0); ++ mv->xy[0] = mvxy_add(mv->xy[0], mvd); ++ } ++ ++ if (inter_pred_idc != PRED_L0) { ++ MvXY mvd = 0; ++ ++ if (s->sh.nb_refs[L1]) ++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); ++ ++ if 
(s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 1); ++ mv->xy[1] = mvxy_add(mv->xy[1], mvd); ++ } ++} ++ ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = NULL; ++ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; ++ const unsigned int max_fill = ipe->max_fill; ++ unsigned int load = UINT_MAX; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { ++ // We will always have enough room between the Qs but if we are ++ // running critically low due to poor scheduling then use fill size ++ // rather than load to determine QPU. This has obvious dire ++ // performance implications but (a) it is better than crashing ++ // and (b) it should (almost) never happen ++ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; ++ const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load; ++ ++ if (tload < load) ++ { ++ yp = ypt; ++ load = tload; ++ } ++ } ++ ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ qpu_mc_link_set(yp->qpu_mc_curr, fn); ++ ++ return yp; ++} ++ ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; ++ ++ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync); ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); ++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage ++ } ++} ++ ++// Returns 0 on success ++// We no longer check for Q fullness as we have emergency code in ctu alloc ++// * However it might be an idea to have some means of spotting that we've used it ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ int rv; ++ ++ memset(ipe, 0, sizeof(*ipe)); ++ if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) ++ return AVERROR(ENOMEM); ++ ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++ if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) ++ av_freep(&ipe->q); ++ return rv; ++} ++ ++ ++#if
RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline uint32_t pack_wo_p(const int off, const int mul) ++{ ++ return PACK2(off * 2 + 1, mul); ++} ++ ++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) ++{ ++ return PACK2(off0 + off1 + 1, mul); ++} ++ ++ ++static void ++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const MvXY mv_xy, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = MV_X(mv_xy) & 3; ++ const unsigned int my = MV_Y(mv_xy) & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ ++ if (my_mx == 0) ++ { ++ const int x1 = x0 + (MV_X(mv_xy) >> 2); ++ const int y1 = y0 + (MV_Y(mv_xy) >> 2); ++ const int bh = nPbH; ++ ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ 
++ts->y_pred1_hle16; ++ } ++#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; ++ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independent 8-wide src blocks we can merge ++ // this pred with the previous one if the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. ++ ++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ start_x = bw; ++#if RPI_TSTATS ++ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; ++#endif ++ } ++#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++
++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ jb->last_y8_l1 = src2; ++ jb->last_y8_p = cmd_y; ++ } ++ } ++ } ++} ++ ++static void ++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct HEVCRpiMvField *const mv_field, ++ const AVFrame *const src_frame, ++ const AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = MV_X(mv) & 3; ++ const unsigned int my = MV_Y(mv) & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = MV_X(mv2) & 3; ++ const unsigned int my2 = MV_Y(mv2) & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); ++ ++ 
const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) ++ { ++ const int x1 = x0 + (MV_X(mv) >> 2); ++ const int y1 = y0 + (MV_Y(mv) >> 2); ++ const int x2 = x0 + (MV_X(mv2) >> 2); ++ const int y2 = y0 + (MV_Y(mv2) >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 ++ const int x1 = x0 + (MV_X(mv) >> 2) - 3; ++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; ++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; ++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; ++ const int bh = nPbH; ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = 
rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int lx, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const MvXY mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - 
vshift)]; ++ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); ++ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); ++ } ++ return; ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct HEVCRpiMvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = 
av_mod_uintp2(MV_X(mv), 2 + hshift); ++ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1; ++ ++ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); ++ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; ++ ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = 
coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); ++ ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++} ++ ++ ++static inline void ++col_stash(const HEVCRpiContext * const s, ++ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, ++ const HEVCRpiMvField * const mvf) ++{ ++ ColMvField * const col_mvf = s->ref->col_mvf; ++ const unsigned int x = (x0 + 15) >> 4; ++ const unsigned int y = (y0 + 15) >> 4; ++ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; ++ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; ++ ++ if (col_mvf != NULL && w != 0 && h != 0) ++ { ++ // Only record MV from the top left of the 16x16 block ++ ++ const RefPicList * const rpl = s->refPicList; ++ const ColMvField cmv = { ++ .L = { ++ { ++ .poc = (mvf->pred_flag & PF_L0) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), ++ .xy = mvf->xy[0] ++ }, ++ { ++ .poc = (mvf->pred_flag & PF_L1) == 0 ? 
++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), ++ .xy = mvf->xy[1] ++ } ++ } ++ }; ++ ++ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; ++ const unsigned int stride = s->col_mvf_stride - w; ++ unsigned int j = h; ++ ++ do ++ { ++ unsigned int k = w; ++ do ++ { ++ *p++ = cmv; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++} ++ ++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) ++{ ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ struct HEVCRpiMvField current_mv = {{0}}; ++ const RefPicList *const refPicList = s->refPicList; ++ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; ++ ++ if (lc->cu.pred_mode != MODE_SKIP) ++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); ++ ++ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { ++ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : ++ ff_hevc_rpi_merge_idx_decode(s, lc); ++ ++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ partIdx, merge_idx, ¤t_mv); ++ } else { ++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, ¤t_mv); ++ } ++ ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ unsigned int i, j; ++ ++ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) ++ { ++ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) ++ p[i] = current_mv; ++ p += MVF_STASH_WIDTH_PU; ++ } ++ } ++ ++ col_stash(s, x0, y0, nPbW, nPbH, ¤t_mv); ++ ++ if (current_mv.pred_flag & PF_L0) { ++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; ++ if (!ref0) ++ return; ++ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); ++ } ++ if (current_mv.pred_flag & PF_L1) { ++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; ++ if (!ref1) ++ return; ++ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); ++ } ++ ++ if (current_mv.pred_flag == PF_L0) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_L1) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 1, x0_c, 
y0_c, nPbW_c, nPbH_c, current_mv.xy[1], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_BI) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); ++ return; ++ } ++ } ++} ++ ++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, ++ const unsigned int ipm) ++{ ++ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; ++ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; ++ ++ { ++ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); ++ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); ++ } ++ ++ // If IRAP then everything is Intra & we avoid ever looking at these ++ // stashes so don't bother setting them ++ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) ++ { ++ if (s->is_intra != NULL) ++ { ++ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); ++ } ++ ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 ++ unsigned int n = size_in_pus; ++ ++ do ++ { ++ memset(p, 0, size_in_pus * sizeof(*p)); ++ p += 
MVF_STASH_WIDTH_PU; ++ } while (--n != 0); ++ } ++ ++ ++ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) ++ { ++ // Only record top left stuff ++ // Blocks should always be aligned on size boundaries ++ // so cannot have overflow from a small block ++ ++ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); ++ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); ++ const unsigned int stride = s->col_mvf_stride - size_in_col; ++ unsigned int j = size_in_col; ++ ++ do ++ { ++ unsigned int k = size_in_col; ++ do ++ { ++ p->L[0].poc = COL_POC_INTRA; ++ p->L[0].xy = 0; ++ p->L[1].poc = COL_POC_INTRA; ++ p->L[1].xy = 0; ++ ++p; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++ } ++} ++ ++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size) ++{ ++ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); ++} ++ ++ ++/** ++ * 8.4.1 ++ */ ++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ int x0, int y0, int log2_pu_size, ++ int prev_intra_luma_pred_flag, ++ const unsigned int idx) ++{ ++ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ ++ // Up does not cross boundaries so as we always scan 1 slice-tile-line in an ++ // lc we can just keep 1 CTB lR stashes ++ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job ++ const unsigned int cand_up = yb_pu == 0 ? 
INTRA_DC : lc->ipm_up[xb_pu]; ++ const unsigned int cand_left = lc->ipm_left[yb_pu]; ++ ++ unsigned int intra_pred_mode; ++ unsigned int a, b, c; ++ ++ if (cand_left == cand_up) { ++ if (cand_left < 2) { ++ a = INTRA_PLANAR; ++ b = INTRA_DC; ++ c = INTRA_ANGULAR_26; ++ } else { ++ a = cand_left; ++ b = 2 + ((cand_left - 2 - 1 + 32) & 31); ++ c = 2 + ((cand_left - 2 + 1) & 31); ++ } ++ } else { ++ a = cand_left; ++ b = cand_up; ++ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? ++ INTRA_PLANAR : ++ (cand_left != INTRA_DC && cand_up != INTRA_DC) ? ++ INTRA_DC : ++ INTRA_ANGULAR_26; ++ } ++ ++ if (prev_intra_luma_pred_flag) { ++ intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c; ++ } else { ++ // Sort lowest 1st ++ if (a > b) ++ FFSWAP(int, a, b); ++ if (a > c) ++ FFSWAP(int, a, c); ++ if (b > c) ++ FFSWAP(int, b, c); ++ ++ intra_pred_mode = idx; ++ if (intra_pred_mode >= a) ++ intra_pred_mode++; ++ if (intra_pred_mode >= b) ++ intra_pred_mode++; ++ if (intra_pred_mode >= c) ++ intra_pred_mode++; ++ } ++ ++ /* write the intra prediction units into the mv array */ ++ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); ++ return intra_pred_mode; ++} ++ ++static const uint8_t tab_mode_idx[] = { ++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, ++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; ++ ++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size) ++{ ++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; ++ uint8_t prev_intra_luma_pred_flag[4]; ++ int split = lc->cu.part_mode == PART_NxN; ++ const unsigned int split_size = (1 << (log2_cb_size - 1)); ++ int chroma_mode; ++ const unsigned int n = split ? 
4 : 1; ++ unsigned int i; ++ ++ for (i = 0; i != n; i++) ++ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); ++ ++ for (i = 0; i < n; i++) { ++ // depending on mode idx is mpm or luma_pred_mode ++ const unsigned int idx = prev_intra_luma_pred_flag[i] ? ++ ff_hevc_rpi_mpm_idx_decode(lc) : ++ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); ++ ++ lc->pu.intra_pred_mode[i] = ++ luma_intra_pred_mode(s, lc, ++ x0 + ((i & 1) == 0 ? 0 : split_size), ++ y0 + ((i & 2) == 0 ? 0 : split_size), ++ log2_cb_size - split, ++ prev_intra_luma_pred_flag[i], idx); ++ } ++ ++ if (ctx_cfmt(s) == 3) { ++ for (i = 0; i < n; i++) { ++ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[i] = 34; ++ else ++ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; ++ } ++ } ++ } else if (ctx_cfmt(s) == 2) { ++ int mode_idx; ++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ mode_idx = 34; ++ else ++ mode_idx = intra_chroma_table[chroma_mode]; ++ } else { ++ mode_idx = lc->pu.intra_pred_mode[0]; ++ } ++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; ++ } else if (ctx_cfmt(s) != 0) { ++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[0] = 34; ++ else ++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; ++ } ++ } ++} ++ ++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, const 
unsigned int log2_cb_size) ++{ ++ const unsigned int cb_size = 1 << log2_cb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = x0 >> log2_min_cb_size; ++ const unsigned int y_cb = y0 >> log2_min_cb_size; ++ const unsigned int idx = log2_cb_size - 2; ++ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; ++ int skip_flag = 0; ++ ++ lc->cu.x = x0; ++ lc->cu.y = y0; ++ lc->cu.x_split = x0; ++ lc->cu.y_split = y0; ++ ++ lc->cu.pred_mode = MODE_INTRA; ++ lc->cu.part_mode = PART_2Nx2N; ++ lc->cu.intra_split_flag = 0; ++ lc->cu.cu_transquant_bypass_flag = 0; ++ lc->pu.intra_pred_mode[0] = 1; ++ lc->pu.intra_pred_mode[1] = 1; ++ lc->pu.intra_pred_mode[2] = 1; ++ lc->pu.intra_pred_mode[3] = 1; ++ ++ if (s->ps.pps->transquant_bypass_enable_flag) { ++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); ++ if (lc->cu.cu_transquant_bypass_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) { ++ lc->cu.pred_mode = MODE_INTER; ++ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); ++ } ++ ++ if (skip_flag) { ++ lc->cu.pred_mode = MODE_SKIP; ++ ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ } else { ++ int pcm_flag = 0; ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) ++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); ++ if (lc->cu.pred_mode != MODE_INTRA || ++ log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); ++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && ++ lc->cu.pred_mode == MODE_INTRA; ++ } ++ ++ if (lc->cu.pred_mode == 
MODE_INTRA) { ++ if (lc->cu.part_mode == PART_2Nx2N && ++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled ++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && ++ ff_hevc_rpi_pcm_flag_decode(lc) != 0) ++ { ++ int ret; ++ pcm_flag = 1; ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) ++ return ret; ++ ++ if (s->ps.sps->pcm.loop_filter_disable_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } else { ++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); ++ } ++ } else { ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ switch (lc->cu.part_mode) { ++ case PART_2Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ break; ++ case PART_2NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); ++ break; ++ case PART_Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); ++ lc->cu.x_split = x0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); ++ break; ++ case PART_2NxnU: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); ++ break; ++ case PART_2NxnD: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4 * 3; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); ++ break; ++ case PART_nLx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); ++ lc->cu.x_split = x0 + cb_size / 4; ++ hls_prediction_unit(s, lc, 
x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_nRx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2); ++ lc->cu.x_split = x0 + cb_size / 4 * 3; ++ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); ++ lc->cu.x_split = x0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); ++ lc->cu.y_split = y0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); ++ break; ++ } ++ } ++ ++ if (!pcm_flag) { ++ int rqt_root_cbf = 1; ++ ++ if (lc->cu.pred_mode != MODE_INTRA && ++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { ++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); ++ } ++ if (rqt_root_cbf) { ++ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); ++ int ret; ++ ++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? 
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : ++ s->ps.sps->max_transform_hierarchy_depth_inter; ++ // transform_tree does deblock_boundary_strengths ++ ret = hls_transform_tree(s, lc, x0, y0, ++ log2_cb_size, 0, 0, cbf_c); ++ if (ret < 0) ++ return ret; ++ } else { ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ } ++ } ++ } ++ ++ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here ++ if (lc->tu.is_cu_qp_delta_wanted) ++ ff_hevc_rpi_set_qPy(s, lc, x0, y0); ++ ++ if(((x0 + (1<qPy_pred = lc->qp_y; ++ } ++ ++ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); ++ ++ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); ++ ++ return 0; ++} ++ ++// Returns: ++// < 0 Error ++// 0 More data wanted ++// 1 EoSlice / EoPicture ++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int log2_cb_size, const unsigned int cb_depth) ++{ ++ const int cb_size = 1 << log2_cb_size; ++ int ret; ++ int split_cu; ++ ++ lc->ct_depth = cb_depth; ++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); ++ if (x0 + cb_size <= s->ps.sps->width && ++ y0 + cb_size <= s->ps.sps->height && ++ split_cu) ++ { ++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); ++ } ++ ++ // Qp delta (and offset) need to remain wanted if cb_size < min until ++ // a coded block is found so we still initial state at depth 0 (outside ++ // this fn) and only reset here ++ if (s->ps.pps->cu_qp_delta_enabled_flag && ++ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) ++ { ++ lc->tu.is_cu_qp_delta_wanted = 1; ++ lc->tu.cu_qp_delta = 0; ++ } ++ if (s->sh.cu_chroma_qp_offset_enabled_flag && ++ log2_cb_size >= 
s->ps.pps->log2_min_cu_qp_delta_size) ++ { ++ lc->tu.cu_chroma_qp_offset_wanted = 1; ++ } ++ ++ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; ++ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; ++ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; ++ ++ if (split_cu) { ++ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; ++ const int cb_size_split = cb_size >> 1; ++ const int x1 = x0 + cb_size_split; ++ const int y1 = y0 + cb_size_split; ++ ++ int more_data = 0; ++ ++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ ++ if (more_data && x1 < s->ps.sps->width) { ++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && x1 < s->ps.sps->width && ++ y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ ++ if(((x0 + (1<qPy_pred = lc->qp_y; ++ ++ if (more_data) ++ return ((x1 + cb_size_split) < s->ps.sps->width || ++ (y1 + cb_size_split) < s->ps.sps->height); ++ else ++ return 0; ++ } else { ++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); ++ if (ret < 0) ++ return ret; ++ if ((!((x0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (x0 + cb_size >= s->ps.sps->width)) && ++ (!((y0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (y0 + cb_size >= s->ps.sps->height))) { ++ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); ++ return !end_of_slice_flag; ++ } else { ++ return 1; ++ } ++ } ++ ++ return 0; // NEVER ++} ++ ++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int 
x_ctb, const int y_ctb, const int ctb_addr_ts) ++{ ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ const unsigned int line_w = s->ps.sps->ctb_width; ++ ++ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; ++ ++ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); ++ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); ++ ++ lc->boundary_flags = 0; ++ ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) ++ lc->boundary_flags |= BOUNDARY_LEFT_TILE; ++ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) ++ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; ++ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) ++ lc->boundary_flags |= BOUNDARY_UPPER_TILE; ++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) ++ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; ++ ++ // Use line width rather than tile width for addr_in_slice test as ++ // addr_in_slice is in raster units ++ ++ lc->ctb_avail = ++ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | ++ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | ++ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | ++ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && ++ (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); ++ // Down-left never avail at CTB level ++} ++ ++ ++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, ++ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); ++ ++ // Signal ++ if (y > 0) { ++ // Cast away const as progress is held in s, but this really shouldn't confuse anything ++ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); ++ } ++ ++ // Job done now ++ // ? Move outside this fn ++ job_free(s->jbc, jb); ++} ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM ++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ unsigned int i; ++ HEVCRpiIntraPredEnv * const iap = &jb->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++ ++#if !RPI_WORKER_WAIT_PASS_0 ++ rpi_sem_wait(&jb->sem); ++ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 ++#endif ++ ++ for (i = iap->n; i > 0; i--, cmd++) ++ { ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); ++ break; ++ case RPI_PRED_INTRA_C: ++ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); ++ break; ++ case RPI_PRED_ADD_RESIDUAL: ++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, 
cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); ++ } ++ } ++ ++ // Mark done ++ iap->n = 0; ++} ++ ++ ++// Set initial uniform job values & zero ctu_count ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) ++{ ++ unsigned int i; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; ++ const HEVCRpiSPS * const sps = s->ps.sps; ++ ++ const uint16_t pic_width_y = sps->width; ++ const uint16_t pic_height_y = sps->height; ++ ++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); ++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); ++ ++ // We expect the pointer to change if we use another sps ++ if (sps != jb->sps) ++ { ++ worker_pic_free_one(jb); ++ ++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); ++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); ++ ++ { ++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); ++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); ++ } ++ ++ jb->sps = sps; ++ } ++ ++ jb->waited = 0; ++ jb->ctu_ts_first = ctu_ts_first; ++ jb->ctu_ts_last = -1; ++ ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = 
av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ cp->last_l0 = &u->next_src1; ++ ++ u->next_fn = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; ++ ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++ ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); ++ } ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ jb->progress_req[i] = -1; ++ } ++ ++ worker_pic_reset(&jb->coeffs); ++} ++ ++ ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; ++ ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); ++ ++ // Need to set the srcs for L0 & L1 to 
something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; ++} ++#endif ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 
1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++ ++ ++static void flush_frame(HEVCRpiContext *s,AVFrame *frame) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} ++ ++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; ++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; ++ const unsigned int ctb_width = s->ps.sps->ctb_width; ++ RpiBlk *const bounds = &jb->bounds; ++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); ++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ ++ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); ++ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); ++} ++ ++#if RPI_PASSES == 2 ++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, jb); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s, jb); ++} ++#endif ++ ++// Core execution tasks ++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ int pred_y, pred_c; ++ vpu_qpu_job_env_t qvbuf; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); ++#if RPI_WORKER_WAIT_PASS_0 ++ int do_wait; ++#endif ++ ++ { ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if 
(cf->s[3].n + cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ unsigned int n16 = (cf->s[2].n >> 8); ++ unsigned int n32 = (cf->s[3].n >> 10); ++#if RPI_COMPRESS_COEFFS ++ if (cf->s[2].packed) { ++ n16 = n16 | (n16<<16); ++ } else { ++ const unsigned int npack16 = (cf->s[2].packed_n>>8); ++ n16 = n16 | (npack16<<16); ++ } ++ if (cf->s[3].packed) { ++ n32 = n32 | (n32<<16); ++ } else { ++ const unsigned int npack32 = (cf->s[3].packed_n>>10); ++ n32 = n32 | (npack32<<16); ++ } ++#endif ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ n16, ++ cf->gptr.vc + offset32, ++ n32, ++ 0); ++ ++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); ++ } ++ } ++ ++ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); ++ ++// We could take a sync here and try to locally overlap QPU processing with ARM ++// but testing showed a slightly negative benefit with noticable extra complexity ++ ++ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); ++ ++ // Returns 0 if nothing to do, 1 if sync added ++#if RPI_WORKER_WAIT_PASS_0 ++ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem); ++#else ++ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0) ++ sem_post(&jb->sem); ++#endif ++ ++ rpi_cache_flush_execute(jb->rfe); ++ ++ // Await progress as required ++ // jb->waited will only be clear if we have already tested the progress values ++ // (in worker_submit_job) and found we don't have to wait ++ if (jb->waited) ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ if (jb->progress_req[i] >= 0) { ++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); ++ } ++ } ++ } ++ ++ 
vpu_qpu_job_finish(vqj);
++
++    // We always work on a rectangular block
++    if (pred_y || pred_c)
++    {
++        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++                                        ctx_vshift(s, 1), pred_y, pred_c);
++    }
++
++    // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    if (av_rpi_is_sand8_frame(s->frame))
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++    }
++    else
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++    }
++#endif
++// When waiting in pass 0: block on jb->sem if a sync was queued, then execute the flush ops added to jb->rfe above
++#if RPI_WORKER_WAIT_PASS_0
++    if (do_wait)
++        rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe);
++#endif
++}
++
++// Release an inter-pred env: its queue array (q) and, via gpu_free, its gptr allocation
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++    av_freep(&ipe->q);
++    gpu_free(&ipe->gptr);
++}
++// Allocate and initialise one job (semaphore, flush env, progress wait, intra cmds, chroma & luma inter-pred Qs); NULL on failure
++static HEVCRpiJob * job_new(void)
++{
++    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++    if (jb == NULL)
++        return NULL;
++    // From here on any failure must go through the fail labels to undo this setup
++    sem_init(&jb->sem, 0, 0);
++    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
++    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++    jb->intra.n = 0;
++    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
++        goto fail1;
++
++    // * Sizeof the union structure might be overkill but at the moment it
++    //   is correct (it certainly isn't going to be too small)
++    //   Set max fill to slack/2 from the end of the Q
++    //   If we exceed this in any Q then we will schedule by size (which should
++    //   mean that we never use that Q again apart from syncs)
++    // * Given how aggressive the overflow response is we could maybe put the
++    //   threshold
even nearer the end, but I don't expect us to ever hit ++ // it on any real stream anyway. ++ ++ if (rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), ++ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) ++ goto fail2; ++ if (rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), ++ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) ++ goto fail3; ++ ++ return jb; ++ ++fail3: ++ rpi_free_inter_pred(&jb->luma_ip); ++fail2: ++ av_freep(&jb->intra.cmds); ++fail1: ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_cache_flush_finish(jb->rfe); ++ sem_destroy(&jb->sem); ++ return NULL; ++} ++ ++static void job_delete(HEVCRpiJob * const jb) ++{ ++ worker_pic_free_one(jb); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ av_freep(&jb->intra.cmds); ++ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing ++ sem_destroy(&jb->sem); ++ av_free(jb); ++} ++ ++static void jbg_delete(HEVCRpiJobGlobal * const jbg) ++{ ++ HEVCRpiJob * jb; ++ ++ if (jbg == NULL) ++ return; ++ ++ jb = jbg->free1; ++ while (jb != NULL) ++ { ++ HEVCRpiJob * const jb2 = jb; ++ jb = jb2->next; ++ job_delete(jb2); ++ } ++ ++ pthread_mutex_destroy(&jbg->lock); ++ av_free(jbg); ++} ++ ++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) ++{ ++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); ++ if (jbg == NULL) ++ return NULL; ++ ++ pthread_mutex_init(&jbg->lock, NULL); ++ ++ while (job_count-- != 0) ++ { ++ HEVCRpiJob * const jb = job_new(); ++ if (jb == NULL) ++ goto fail; ++ ++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ ++ return jbg; ++ ++fail: ++ jbg_delete(jbg); ++ return NULL; ++} ++ ++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) ++{ ++ 
HEVCRpiJobGlobal * jbg;
++
++    if (jbc == NULL)
++        return;
++    // Read jbg now: jbc is freed before the global reference is dropped
++    jbg = jbc->jbg;
++
++    if (jbc->jb1 != NULL)
++        job_delete(jbc->jb1);
++
++    pthread_mutex_destroy(&jbc->in_lock);
++    sem_destroy(&jbc->sem_out);
++    av_free(jbc);
++
++    // Deref the global job context
++    if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) // old value 1 => that was the last reference
++        jbg_delete(jbg);
++}
++// Create a job controller that takes a reference on jbg and owns one local job (jb1); NULL on failure
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++    HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
++
++    if (jbc == NULL)
++        return NULL;
++
++    jbc->jbg = jbg;
++    atomic_fetch_add(&jbg->ref_count, 1);
++
++    sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
++    pthread_mutex_init(&jbc->in_lock, NULL);
++
++    if ((jbc->jb1 = job_new()) == NULL)
++        goto fail;
++    jbc->jb1->jbc_local = jbc;
++
++    return jbc;
++
++fail: // rpi_job_ctl_delete also drops the jbg reference taken above
++    rpi_job_ctl_delete(jbc);
++    return NULL;
++}
++
++
++// Set up one pass queue per stage (2 or 3, per RPI_PASSES) and start them; 4th arg looks like the sem a stage posts when done - confirm
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++    pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++    pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++    pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++    pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
++
++    pass_queues_start_all(s);
++}
++// Terminate and kill all pass queues, then delete the job controller and clear s->jbc
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++    pass_queues_term_all(s);
++
++    pass_queues_kill_all(s);
++
++    rpi_job_ctl_delete(s->jbc);
++    s->jbc = NULL;
++}
++
++// Sanity-check where this slice segment starts, then reset per-slice decode state; returns 0 or AVERROR_INVALIDDATA
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
++
++    // Check for obvious disasters
++    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
++        av_log(s->avctx,
AV_LOG_ERROR, "Impossible initial tile.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // If dependant then ctb_addr_ts != 0 from previous check ++ if (s->sh.dependent_slice_segment_flag) { ++ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; ++ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { ++ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ tile_id + s->sh.num_entry_point_offsets >= tiles) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Tiled stuff must start at start of tile if it has multiple entry points ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->sh.num_entry_point_offsets != 0 && ++ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ff_hevc_rpi_cabac_init_decoder(lc); ++ ++ // Setup any required decode vars ++ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; ++ ++// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); ++ lc->qp_y = s->sh.slice_qp; ++ ++ // General setup ++ lc->bt_line_no = 0; ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ const GetBitContext * const gb = &s->HEVClc->gb; ++ RpiSliceHeader * const sh = &s->sh; ++ int i, j; ++ ++ const unsigned int length = nal->size; ++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte ++ unsigned int cmpt; ++ unsigned int startheader; ++ ++ if (sh->num_entry_point_offsets == 0) { ++ s->data = NULL; ++ return 0; ++ } ++ ++ // offset in slice header includes emulation prevention bytes. ++ // Unfortunately those have been removed by the time we get here so we ++ // have to compensate. 
The nal layer keeps a track of where they were. ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ ++ for (i = 1; i < sh->num_entry_point_offsets; i++) { ++ offset += (sh->entry_point_offset[i - 1] - cmpt); ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ if (sh->entry_point_offset[i] <= cmpt) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; ++ sh->offset[i - 1] = offset; ++ } ++ ++ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; ++ if (length < offset) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[sh->num_entry_point_offsets - 1] = length - offset; ++ sh->offset[sh->num_entry_point_offsets - 1] = offset; ++ ++ // Remember data start pointer as we won't have nal later ++ s->data = nal->data; ++ return 0; ++} ++ ++ ++// Return ++// < 0 Error ++// 0 OK ++// ++// jb->ctu_ts_last < 0 Job still filling ++// jb->ctu_ts_last >= 0 Job ready ++ ++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ HEVCRpiJob * const jb = lc->jb0; ++ int more_data = 1; ++ unsigned int ctb_addr_ts = lc->ts; ++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; ++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << 
log2_ctb_size; ++ ++ lc->unit_done = 0; ++ ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) ++ { ++ int q_full; ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ ++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); ++ ++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); ++ ++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); ++ ++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; ++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; ++ ++ // Zap stashes if navail ++ if ((lc->ctb_avail & AVAIL_U) == 0) ++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); ++ if ((lc->ctb_avail & AVAIL_L) == 0) ++ { ++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); ++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); ++ } ++#if MVF_STASH_WIDTH > 64 ++ // Restore left mvf stash at start of tile if not at start of line ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) ++ { ++ unsigned int i; ++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); ++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst = *src++; ++ dst += MVF_STASH_WIDTH_PU; ++ } ++ } ++#endif ++ ++ // Set initial tu states ++ lc->tu.cu_qp_delta = 0; ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ ++ // Decode ++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); ++ ++ if (ff_hevc_rpi_cabac_overflow(lc)) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); ++ more_data = AVERROR_INVALIDDATA; ++ } ++ ++ if (more_data < 0) { ++ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken ++ return more_data; ++ } ++ ++ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || ++ (s->ps.pps->entropy_coding_sync_enabled_flag 
&& (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) ++ { ++ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 || ++ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); ++ return -1; ++ } ++ } ++ ++ // --- Post CTB processing ++ ++ // Stash rpl top/left for deblock that needs to remember such things cross-slice ++ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; ++ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; ++ ++ if (!s->is_irap) ++ { ++ // Copy MVF up to up-left & stash to up ++ { ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); ++ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); ++ ++ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); ++ ++ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; ++ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); ++ } ++ // Stash sideways if end of tile line but not end of line (no point) ++ // ** Could/should do this @ end of fn ++#if MVF_STASH_WIDTH > 64 ++ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) ++#endif ++ { ++ unsigned int i; ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); ++ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst++ = *src; ++ src += MVF_STASH_WIDTH_PU; ++ } ++ } ++ } ++ ++ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) ++ ff_hevc_rpi_save_states(s, lc); ++ ++ // Report progress so we can use our MVs in other frames ++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) ++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ ++ // End of line || End of tile line || End of tile ++ // (EoL covers end of frame for our purposes here) ++ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); ++ ++ // Allocate QPU chunks on fixed size 64 pel boundries rather than ++ // whatever ctb_size is today. 
++ // * We might quite like to continue to 64 pel vertical too but that ++ // currently confuses WPP ++ if (((x_ctb + ctb_size) & 63) == 0 || q_full) ++ { ++ int overflow = 0; ++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) ++ overflow = 1; ++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) ++ overflow = 1; ++ if (overflow) ++ { ++ // * This is very annoying (and slow) to cope with in WPP so ++ // we treat it as an error there (no known stream triggers this ++ // with the current buffer sizes). Non-wpp should cope fine. ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); ++ q_full = 1; ++ } ++ } ++ ++ // Inc TS to next. ++ ctb_addr_ts++; ++ ctb_addr_rs++; ++ x_ctb += ctb_size; ++ ++ if (q_full) ++ { ++ // Do job ++ // Prep for submission ++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced ++ job_gen_bounds(s, jb); ++ break; ++ } ++ ++ // If max_blocks started as 0 then this will never be true ++ if (--max_blocks == 0) ++ break; ++ } ++ ++ lc->unit_done = (more_data <= 0); ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) ++{ ++ lc->context = s; ++ lc->jb0 = NULL; ++ lc->lc_n = n; ++ lc->bt_terminate = 0; ++ lc->bt_psem_out = NULL; ++ sem_init(&lc->bt_sem_in, 0, 0); ++} ++ ++#define TRACE_WPP 0 ++#if RPI_EXTRA_BIT_THREADS > 0 ++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) ++{ ++ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; ++ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; ++} ++ ++// Move local context parameters from an aux bit thread back to the main ++// thread at the end of a slice as processing is going to continue there. 
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep) ++{ ++ if (src_lc == dst_lc) { ++ return; ++ } ++ ++ // Move the job ++ // We will still have an active job if the final line terminates early ++ // Dest should always be null by now ++ av_assert1(dst_lc->jb0 == NULL); ++ dst_lc->jb0 = src_lc->jb0; ++ src_lc->jb0 = NULL; ++ ++ // Always need to store where we are in the bitstream ++ dst_lc->ts = src_lc->ts; ++ dst_lc->gb = src_lc->gb; ++ // Cabac init request will be built at start of next slice ++ ++ // Need to store context if we might have a dependent seg ++ if (is_dep) ++ { ++ dst_lc->qPy_pred = src_lc->qPy_pred; ++ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left)); ++ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); ++ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); ++ } ++} ++ ++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc) ++{ ++ rpi_sem_wait(&lc->bt_sem_in); ++ return lc->bt_terminate; ++} ++ ++// Do one WPP line ++// Will not work correctly over horizontal tile boundries - vertical should be OK ++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first) ++{ ++ const int is_tile = lc->bt_is_tile; ++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts]; ++ const unsigned int line = lc->bt_line_no; ++ const unsigned int line_inc = lc->bt_line_inc; ++ const int is_last = (line >= lc->bt_last_line); ++ ++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width); ++ const unsigned int ts_next = ++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? ++ INT_MAX : ++ is_tile ? ++ s->ps.pps->tile_pos_ts[tile_id + line_inc] : ++ lc->ts + lc->bt_line_width * line_inc; ++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) ++ const unsigned int partial_size = is_tile ? 
line_ts_width(s, lc->ts) : 2; ++ unsigned int ts_prev; ++ int loop_n = 0; ++ int err = 0; ++ ++ av_assert1(line <= s->sh.num_entry_point_offsets); ++ ++#if TRACE_WPP ++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__, ++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id, ++ line, lc->bt_last_line, s->sh.num_entry_point_offsets, ++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); ++#endif ++ if (line != 0) ++ { ++ const uint8_t * const data = s->data + s->sh.offset[line - 1]; ++ const unsigned int len = s->sh.size[line - 1]; ++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) ++ return err; ++ ++ ff_init_cabac_decoder(&lc->cc, data, len); ++ } ++ ++ // We should never be processing a dependent slice here so reset is good ++ // ?? These probably shouldn't be needed (as they should be set by later ++ // logic) but do seem to be required ++ lc->qp_y = s->sh.slice_qp; ++ ++ do ++ { ++ if (!is_last && loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ // The wait for loop_n == 0 has been done in bit_thread ++ if (!is_first && loop_n != 0) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? 
"" : "ERR: ", &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++#if TRACE_WPP ++ { ++ int n; ++ sem_getvalue(&lc->bt_sem_in, &n); ++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); ++ } ++#endif ++ ++ ts_prev = lc->ts; ++ ++ // If we have had an error - do no further decode but do continue ++ // moving signals around so the other threads continue to operate ++ // correctly (or at least as correctly as they can with this line missing) ++ // ++ // Errors in WPP/Tile are less fatal than normal as we have a good idea ++ // of how to restart on the next line so there is no need to give up totally ++ if (err != 0) ++ { ++ lc->unit_done = 0; ++ lc->ts += partial_size; ++ } ++ else ++ { ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, partial_size)) < 0 || ++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) ++ { ++ if (err == 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); ++ err = AVERROR_INVALIDDATA; ++ } ++ worker_free(s, lc); ++ lc->ts = ts_prev + partial_size; // Pretend we did all that ++ lc->unit_done = 0; ++ } ++ else if (is_tile) ++ { ++ worker_submit_job(s, lc); ++ } ++ } ++ ++ ++loop_n; ++ } while (lc->ts < ts_eol && !lc->unit_done); ++ ++ // If we are on the last line & we didn't get a whole line we must wait for ++ // and sink the sem_posts from the line above / tile to the left. 
++ while ((ts_prev += partial_size) < ts_eol) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++ lc->bt_line_no += line_inc; ++ ++ if (!is_tile && err == 0) ++ worker_submit_job(s, lc); ++ ++ if (!is_last) { ++ lc->ts = ts_next; ++ ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ if (loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ } ++ else ++ { ++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT ++#if MVF_STASH_WIDTH > 64 ++ // Horrid calculations to work out what we want but luckily this should almost never execute ++ // **** Move to movlc ++ if (!s->is_irap) ++ { ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts]; ++ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf ++ { ++ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1; ++ unsigned int i; ++ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); ++ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); ++ ++ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i) ++ { ++ *d_mvf = *s_mvf; ++ d_mvf += MVF_STASH_WIDTH_PU; ++ s_mvf += MVF_STASH_WIDTH_PU; ++ } ++ ++ } ++ } ++#endif ++ // When all done poke the thread 0 sem_in one final time ++#if TRACE_WPP ++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); ++#endif ++ sem_post(&s->HEVClcList[0]->bt_sem_in); ++ } ++ ++#if TRACE_WPP ++ printf("%s[%d]: End. 
dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag); ++#endif ++ return err; ++} ++ ++static void wpp_setup_lcs(HEVCRpiContext * const s) ++{ ++ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int line_width = line_ts_width(s, ts); ++ ++ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ lc->ts = ts; ++ lc->bt_is_tile = 0; ++ lc->bt_line_no = i; ++ lc->bt_line_width = line_width; ++ lc->bt_last_line = s->sh.num_entry_point_offsets; ++ lc->bt_line_inc = RPI_BIT_THREADS; ++ ts += line_width; ++ } ++} ++ ++ ++// Can only process tile single row at once ++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row) ++{ ++ const HEVCRpiPPS * const pps = s->ps.pps; ++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int tile0 = pps->tile_id[ts0]; ++ const unsigned int col0 = tile0 % pps->num_tile_columns; ++ ++ const unsigned int col = (slice_row == 0) ? 
col0 : 0; ++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; ++ const unsigned int last_line = FFMIN( ++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); ++ ++ const unsigned int par = ++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); ++#if TRACE_WPP ++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, ++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); ++#endif ++ for (unsigned int i = 0; i != par; ++i, ++line) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ const unsigned int tile = tile0 + line; ++ ++ lc->ts = pps->tile_pos_ts[tile]; ++ lc->bt_line_no = line; ++ lc->bt_is_tile = 1; ++ lc->bt_line_width = line_ts_width(s, lc->ts); ++ lc->bt_last_line = last_line; ++ lc->bt_line_inc = par; ++ } ++} ++ ++ ++static void * bit_thread(void * v) ++{ ++ HEVCRpiLocalContext * const lc = v; ++ HEVCRpiContext *const s = lc->context; ++ ++ while (wait_bt_sem_in(lc) == 0) ++ { ++ int err; ++ ++ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp ++ if (lc->bt_terminate) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); ++ break; ++ } ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); ++ } ++ } ++ ++ return NULL; ++} ++ ++static int bit_threads_start(HEVCRpiContext * const s) ++{ ++ if (s->bt_started) ++ return 0; ++ ++ for (int i = 1; i < RPI_BIT_THREADS; ++i) ++ { ++ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] ++ if (s->HEVClcList[i] == NULL) { ++ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) ++ return -1; ++ } ++ ++ bt_lc_init(s, s->HEVClcList[i], i); ++ job_lc_init(s->HEVClcList[i]); ++ } ++ ++ // Link the sems in a circle ++ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) ++ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; ++ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = 
&s->HEVClcList[0]->bt_sem_in; ++ ++ // Init all lc before starting any threads ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) ++ return -1; ++ } ++ ++ s->bt_started = 1; ++ return 0; ++} ++ ++static int bit_threads_kill(HEVCRpiContext * const s) ++{ ++ if (!s->bt_started) ++ return 0; ++ s->bt_started = 0; ++ ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; ++ if (lc == NULL) ++ break; ++ ++ lc->bt_terminate = 1; ++ sem_post(&lc->bt_sem_in); ++ pthread_join(s->bit_threads[i], NULL); ++ ++ sem_destroy(&lc->bt_sem_in); ++ job_lc_kill(lc); ++ } ++ return 0; ++} ++#endif ++ ++ ++// If we are at EoT and the row is shorter than the number of jobs ++// we can Q we have to wait for it finish otherwise we risk cache/QPU ++// disasters ++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n) ++{ ++ return ++ s->ps.pps->tile_wpp_inter_disable >= 2 && ++ s->sh.slice_type != HEVC_SLICE_I && ++ n >= 0 && ++ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT; ++} ++ ++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++{ ++ HEVCRpiContext * const s = avctxt->priv_data; ++ HEVCRpiLocalContext * const lc = s->HEVClc; ++ int err; ++ ++ // Start of slice ++ if ((err = slice_start(s, lc)) != 0) ++ return err; ++ ++#if RPI_EXTRA_BIT_THREADS > 0 ++ ++ if (s->sh.offload_tiles) ++ { ++ unsigned int slice_row = 0; ++ ++#if TRACE_WPP ++ printf("%s: Do Tiles\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ do ++ { ++ // Reset lc lines etc. 
++ tile_one_row_setup_lcs(s, slice_row); ++ ++#if TRACE_WPP ++ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ while (lc->bt_line_no <= lc->bt_last_line) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++ ++slice_row; ++ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); ++ ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else if (s->sh.offload_wpp) ++ { ++#if TRACE_WPP ++ printf("%s: Do WPP\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ // Reset lc lines etc. 
++ wpp_setup_lcs(s); ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Done 1st\n", __func__); ++#endif ++ ++ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else ++#endif ++ { ++#if TRACE_WPP ++ printf("%s: Single start: ts=%d\n", __func__, lc->ts); ++#endif ++ // Single bit thread ++ do { ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, 0)) < 0) ++ goto fail; ++ ++ worker_submit_job(s, lc); ++ ++ if (tile_needs_wait(s, lc->ts - 1)) ++ worker_wait(s, lc); ++ ++ } while (!lc->unit_done); ++ ++#if TRACE_WPP ++ printf("%s: Single end: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ ++ // If we have reached the end of the frame or ++ // then wait for the worker to finish all its jobs ++ if (lc->ts >= s->ps.sps->ctb_size) ++ worker_wait(s, lc); ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ ++ return lc->ts; ++ ++fail: ++ // Cleanup ++ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); ++ // Free our job & wait for temination ++ worker_free(s, lc); ++ worker_wait(s, lc); ++ return err; ++} ++ ++ ++static void set_no_backward_pred(HEVCRpiContext * const s) ++{ ++ int i, j; ++ 
const RefPicList *const refPicList = s->refPicList; ++ ++ s->no_backward_pred_flag = 0; ++ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag) ++ return; ++ ++ for (j = 0; j < 2; j++) { ++ for (i = 0; i < refPicList[j].nb_refs; i++) { ++ if (refPicList[j].list[i] > s->poc) { ++ s->no_backward_pred_flag = 1; ++ return; ++ } ++ } ++ } ++} ++ ++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ int err; ++ if ((err = gen_entry_points(s, nal)) < 0) ++ return err; ++ ++ set_no_backward_pred(s); ++ ++ return rpi_decode_entry(s->avctx, NULL); ++} ++ ++static int set_side_data(HEVCRpiContext *s) ++{ ++ AVFrame *out = s->ref->frame; ++ ++ if (s->sei.frame_packing.present && ++ s->sei.frame_packing.arrangement_type >= 3 && ++ s->sei.frame_packing.arrangement_type <= 5 && ++ s->sei.frame_packing.content_interpretation_type > 0 && ++ s->sei.frame_packing.content_interpretation_type < 3) { ++ AVStereo3D *stereo = av_stereo3d_create_side_data(out); ++ if (!stereo) ++ return AVERROR(ENOMEM); ++ ++ switch (s->sei.frame_packing.arrangement_type) { ++ case 3: ++ if (s->sei.frame_packing.quincunx_subsampling) ++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; ++ else ++ stereo->type = AV_STEREO3D_SIDEBYSIDE; ++ break; ++ case 4: ++ stereo->type = AV_STEREO3D_TOPBOTTOM; ++ break; ++ case 5: ++ stereo->type = AV_STEREO3D_FRAMESEQUENCE; ++ break; ++ } ++ ++ if (s->sei.frame_packing.content_interpretation_type == 2) ++ stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ ++ if (s->sei.frame_packing.arrangement_type == 5) { ++ if (s->sei.frame_packing.current_frame_is_frame0_flag) ++ stereo->view = AV_STEREO3D_VIEW_LEFT; ++ else ++ stereo->view = AV_STEREO3D_VIEW_RIGHT; ++ } ++ } ++ ++ if (s->sei.display_orientation.present && ++ (s->sei.display_orientation.anticlockwise_rotation || ++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { ++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / 
(double) (1 << 16); ++ AVFrameSideData *rotation = av_frame_new_side_data(out, ++ AV_FRAME_DATA_DISPLAYMATRIX, ++ sizeof(int32_t) * 9); ++ if (!rotation) ++ return AVERROR(ENOMEM); ++ ++ av_display_rotation_set((int32_t *)rotation->data, angle); ++ av_display_matrix_flip((int32_t *)rotation->data, ++ s->sei.display_orientation.hflip, ++ s->sei.display_orientation.vflip); ++ } ++ ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.mastering_display.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.mastering_display.present--; ++ } ++ if (s->sei.mastering_display.present) { ++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b ++ const int mapping[3] = {2, 0, 1}; ++ const int chroma_den = 50000; ++ const int luma_den = 10000; ++ int i; ++ AVMasteringDisplayMetadata *metadata = ++ av_mastering_display_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 3; i++) { ++ const int j = mapping[i]; ++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0]; ++ metadata->display_primaries[i][0].den = chroma_den; ++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1]; ++ metadata->display_primaries[i][1].den = chroma_den; ++ } ++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0]; ++ metadata->white_point[0].den = chroma_den; ++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1]; ++ metadata->white_point[1].den = chroma_den; ++ ++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance; ++ metadata->max_luminance.den = luma_den; ++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance; ++ metadata->min_luminance.den = luma_den; ++ metadata->has_luminance = 1; ++ metadata->has_primaries = 1; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering 
Display Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n", ++ av_q2d(metadata->display_primaries[0][0]), ++ av_q2d(metadata->display_primaries[0][1]), ++ av_q2d(metadata->display_primaries[1][0]), ++ av_q2d(metadata->display_primaries[1][1]), ++ av_q2d(metadata->display_primaries[2][0]), ++ av_q2d(metadata->display_primaries[2][1]), ++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1])); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "min_luminance=%f, max_luminance=%f\n", ++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance)); ++ } ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.content_light.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.content_light.present--; ++ } ++ if (s->sei.content_light.present) { ++ AVContentLightMetadata *metadata = ++ av_content_light_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ metadata->MaxCLL = s->sei.content_light.max_content_light_level; ++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", ++ metadata->MaxCLL, metadata->MaxFALL); ++ } ++ ++ if (s->sei.a53_caption.a53_caption) { ++ AVFrameSideData* sd = av_frame_new_side_data(out, ++ AV_FRAME_DATA_A53_CC, ++ s->sei.a53_caption.a53_caption_size); ++ if (sd) ++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); ++ av_freep(&s->sei.a53_caption.a53_caption); ++ s->sei.a53_caption.a53_caption_size = 0; ++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; ++ } ++ ++ if (s->sei.alternative_transfer.present && ++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && ++ 
s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { ++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; ++ } ++ ++ return 0; ++} ++ ++static int hevc_frame_start(HEVCRpiContext * const s) ++{ ++ int ret; ++ ++ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too ++ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); ++ ++ // Only need to remember intra for CIP ++ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) ++ s->is_intra = NULL; ++ else ++ { ++ s->is_intra = s->is_intra_store; ++ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ } ++ ++ s->is_decoded = 0; ++ s->first_nal_type = s->nal_unit_type; ++ ++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); ++ ++ if (s->pkt.nb_nals > s->rpl_tab_size) ++ { ++ // In most cases it will be faster to free & realloc as that doesn't ++ // require (an unwanted) copy ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) ++ goto fail; ++ s->rpl_tab_size = s->pkt.nb_nals; ++ } ++ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); ++ ++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); ++ if (ret < 0) ++ goto fail; ++ ++ // Resize rpl_tab to max that we might want ++ ret = ff_hevc_rpi_frame_rps(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); ++ goto fail; ++ } ++ ++ s->ref->frame->key_frame = IS_IRAP(s); ++ ++ ret = set_side_data(s); ++ if (ret < 0) ++ goto fail; ++ ++ s->frame->pict_type = 3 - s->sh.slice_type; ++ ++ if (!IS_IRAP(s)) ++ ff_hevc_rpi_bump_frame(s); ++ ++ av_frame_unref(s->output_frame); ++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); ++ if (ret < 0) ++ goto fail; ++ ++ 
ff_thread_finish_setup(s->avctx); ++ ++ return 0; ++ ++fail: ++ if (s->ref) ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ s->ref = NULL; ++ return ret; ++} ++ ++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type) ++{ ++ // From Table 7-1 ++ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14 ++} ++ ++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ int ctb_addr_ts, ret; ++ ++ *gb = nal->gb; ++ s->nal_unit_type = nal->type; ++ s->temporal_id = nal->temporal_id; ++ ++ switch (s->nal_unit_type) { ++ case HEVC_NAL_VPS: ++ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, ++ s->apply_defdispwin); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_TRAIL_R: ++ case HEVC_NAL_TRAIL_N: ++ case HEVC_NAL_TSA_N: ++ case HEVC_NAL_TSA_R: ++ case HEVC_NAL_STSA_N: ++ case HEVC_NAL_STSA_R: ++ case HEVC_NAL_BLA_W_LP: ++ case HEVC_NAL_BLA_W_RADL: ++ case HEVC_NAL_BLA_N_LP: ++ case HEVC_NAL_IDR_W_RADL: ++ case HEVC_NAL_IDR_N_LP: ++ case HEVC_NAL_CRA_NUT: ++ case HEVC_NAL_RADL_N: ++ case HEVC_NAL_RADL_R: ++ case HEVC_NAL_RASL_N: ++ case HEVC_NAL_RASL_R: ++ ret = hls_slice_header(s); ++ if (ret < 0) ++ return ret; ++ ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. 
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !is_non_ref_unit_type(s->nal_unit_type); ++ s->offload_recon = s->threads_type != 0 && s->used_for_ref; ++ s->is_irap = IS_IRAP(s); ++ ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif ++ if ( ++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || ++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || ++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || ++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) ++ { ++ s->is_decoded = 0; ++ break; ++ } ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; ++ } else { ++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) ++ s->max_ra = INT_MIN; ++ } ++ ++ ret = hevc_frame_start(s); ++ if (ret < 0) ++ return ret; ++ } else if (!s->ref) { ++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); ++ goto fail; ++ } ++ ++ if (s->nal_unit_type != s->first_nal_type) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Non-matching NAL types of the VCL NALUs: %d %d\n", ++ s->first_nal_type, s->nal_unit_type); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!s->sh.dependent_slice_segment_flag && ++ s->sh.slice_type != HEVC_SLICE_I) { ++ ret = ff_hevc_rpi_slice_rpl(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error constructing the reference lists for the current slice.\n"); ++ goto fail; ++ } ++ } ++ ++ ctb_addr_ts = hls_slice_data(s, nal); ++ if (ctb_addr_ts >= s->ps.sps->ctb_size) { ++ s->is_decoded = 1; ++ } ++ 
++ if (ctb_addr_ts < 0) { ++ ret = ctb_addr_ts; ++ goto fail; ++ } ++ break; ++ case HEVC_NAL_EOS_NUT: ++ case HEVC_NAL_EOB_NUT: ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ break; ++ case HEVC_NAL_AUD: ++ case HEVC_NAL_FD_NUT: ++ break; ++ default: ++ av_log(s->avctx, AV_LOG_INFO, ++ "Skipping NAL unit %d\n", s->nal_unit_type); ++ } ++ ++ return 0; ++fail: ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return ret; ++ return 0; ++} ++ ++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) ++{ ++ int i, ret = 0; ++ int eos_at_start = 1; ++ ++ s->ref = NULL; ++ s->last_eos = s->eos; ++ s->eos = 0; ++ ++ /* split the input packet into NAL units, so we know the upper bound on the ++ * number of slices in the frame */ ++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, ++ s->nal_length_size, s->avctx->codec_id, 0, 0); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Error splitting the input into NAL units.\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || ++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { ++ if (eos_at_start) { ++ s->last_eos = 1; ++ } else { ++ s->eos = 1; ++ } ++ } else { ++ eos_at_start = 0; ++ } ++ } ++ ++ /* decode the NAL units */ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ ret = decode_nal_unit(s, &s->pkt.nals[i]); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error parsing NAL unit #%d.\n", i); ++ goto fail; ++ } ++ } ++ ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type != 0) { ++ ff_hevc_rpi_progress_signal_all_done(s); ++ } ++ else { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++ } ++ return ret; ++} ++ ++static void print_md5(void *log_ctx, int level, uint8_t md5[16]) ++{ ++ int i; ++ for (i = 0; i < 16; i++) ++ av_log(log_ctx, level, "%02"PRIx8, 
md5[i]); ++} ++ ++static int verify_md5(HEVCRpiContext *s, AVFrame *frame) ++{ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ int pixel_shift; ++ int i, j; ++ ++ if (!desc) ++ return AVERROR(EINVAL); ++ ++ pixel_shift = desc->comp[0].depth > 8; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", ++ s->poc); ++ ++ /* the checksums are LE, so we have to byteswap for >8bpp formats ++ * on BE arches */ ++#if HAVE_BIGENDIAN ++ if (pixel_shift && !s->checksum_buf) { ++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, ++ FFMAX3(frame->linesize[0], frame->linesize[1], ++ frame->linesize[2])); ++ if (!s->checksum_buf) ++ return AVERROR(ENOMEM); ++ } ++#endif ++ ++ for (i = 0; frame->data[i]; i++) { ++ int width = s->avctx->coded_width; ++ int height = s->avctx->coded_height; ++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; ++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; ++ uint8_t md5[16]; ++ ++ av_md5_init(s->md5_ctx); ++ for (j = 0; j < h; j++) { ++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); ++#if HAVE_BIGENDIAN ++ if (pixel_shift) { ++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, ++ (const uint16_t *) src, w); ++ src = s->checksum_buf; ++ } ++#endif ++ av_md5_update(s->md5_ctx, src, w << pixel_shift); ++ } ++ av_md5_final(s->md5_ctx, md5); ++ ++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { ++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); ++ print_md5(s->avctx, AV_LOG_DEBUG, md5); ++ av_log (s->avctx, AV_LOG_DEBUG, "; "); ++ } else { ++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); ++ print_md5(s->avctx, AV_LOG_ERROR, md5); ++ av_log (s->avctx, AV_LOG_ERROR, " != "); ++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); ++ av_log (s->avctx, AV_LOG_ERROR, "\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "\n"); ++ ++ return 0; ++} ++ 
++static int all_sps_supported(const HEVCRpiContext * const s) ++{ ++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (s->ps.sps_list[i] != NULL) ++ { ++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ if (!is_sps_supported(sps)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) ++{ ++ int ret, i; ++ ++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, ++ &s->nal_length_size, s->avctx->err_recognition, ++ s->apply_defdispwin, s->avctx); ++ if (ret < 0) ++ return ret; ++ ++ /* export stream parameters from the first SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (first && s->ps.sps_list[i]) { ++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ export_stream_params(s->avctx, &s->ps, sps); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, ++ AVPacket *avpkt) ++{ ++ int ret; ++ int new_extradata_size; ++ uint8_t *new_extradata; ++ HEVCRpiContext *s = avctx->priv_data; ++ ++ if (!avpkt->size) { ++ ret = ff_hevc_rpi_output_frame(s, data, 1); ++ if (ret < 0) ++ return ret; ++ ++ *got_output = ret; ++ return 0; ++ } ++ ++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &new_extradata_size); ++ if (new_extradata && new_extradata_size > 0) { ++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); ++ if (ret < 0) ++ return ret; ++ } ++ ++ s->ref = NULL; ++ ret = decode_nal_units(s, avpkt->data, avpkt->size); ++ if (ret < 0) ++ return ret; ++ ++ /* verify the SEI checksum */ ++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && ++ s->sei.picture_hash.is_md5) { ++ ret = verify_md5(s, s->ref->frame); ++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ return ret; ++ } ++ 
} ++ s->sei.picture_hash.is_md5 = 0; ++ ++ if (s->is_decoded) { ++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); ++ s->is_decoded = 0; ++ } ++ ++ if (s->output_frame->buf[0]) { ++ av_frame_move_ref(data, s->output_frame); ++ *got_output = 1; ++ } ++ ++ return avpkt->size; ++} ++ ++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) ++{ ++ int ret; ++ ++ ret = ff_thread_ref_frame(&dst->tf, &src->tf); ++ if (ret < 0) ++ return ret; ++ ++ if (src->col_mvf_buf != NULL) ++ { ++ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); ++ if (!dst->col_mvf_buf) ++ goto fail; ++ } ++ dst->col_mvf = src->col_mvf; ++ ++ dst->poc = src->poc; ++ dst->flags = src->flags; ++ dst->sequence = src->sequence; ++ return 0; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, dst, ~0); ++ return AVERROR(ENOMEM); ++} ++ ++ ++static av_cold int hevc_decode_free(AVCodecContext *avctx) ++{ ++ HEVCRpiContext * const s = avctx->priv_data; ++ int i; ++ ++ pic_arrays_free(s); ++ ++ av_freep(&s->md5_ctx); ++ ++ av_freep(&s->cabac_save); ++ ++#if RPI_EXTRA_BIT_THREADS ++ bit_threads_kill(s); ++#endif ++ ++ hevc_exit_worker(s); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); ++ } ++ job_lc_kill(s->HEVClc); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ av_frame_free(&s->output_frame); ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++ av_frame_free(&s->DPB[i].frame); ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) ++ av_buffer_unref(&s->ps.vps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) ++ av_buffer_unref(&s->ps.sps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) ++ av_buffer_unref(&s->ps.pps_list[i]); ++ s->ps.sps = NULL; ++ s->ps.pps = NULL; ++ s->ps.vps = NULL; ++ ++ // Free separately from sLists as used that way by RPI WPP ++ for (i = 0; 
i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { ++ av_freep(s->HEVClcList + i); ++ } ++ s->HEVClc = NULL; // Allocated as part of HEVClcList ++ ++ ff_h2645_packet_uninit(&s->pkt); ++ ++ if (s->qpu_init_ok) ++ vpu_qpu_term(); ++ s->qpu_init_ok = 0; ++ ++ return 0; ++} ++ ++ ++static av_cold int hevc_init_context(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ int i; ++ ++ s->avctx = avctx; ++ ++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); ++ if (!s->HEVClc) ++ goto fail; ++ s->HEVClcList[0] = s->HEVClc; ++ ++ if (vpu_qpu_init() != 0) ++ goto fail; ++ s->qpu_init_ok = 1; ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; ++ } ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_dummy(); ++#endif ++ ++ bt_lc_init(s, s->HEVClc, 0); ++ job_lc_init(s->HEVClc); ++ ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } ++ ++ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) ++ goto fail; ++ ++ if ((s->output_frame = av_frame_alloc()) == NULL) ++ goto fail; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ s->DPB[i].frame = av_frame_alloc(); ++ if (!s->DPB[i].frame) ++ goto fail; ++ s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; ++ } ++ ++ s->max_ra = INT_MAX; ++ ++ if ((s->md5_ctx = av_md5_alloc()) == NULL) ++ goto fail; ++ ++ s->context_initialized = 1; ++ s->eos = 0; ++ ++ ff_hevc_rpi_reset_sei(&s->sei); ++ ++ return 0; ++ ++fail: ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); ++ hevc_decode_free(avctx); ++ return AVERROR(ENOMEM); ++} ++ ++#if HAVE_THREADS ++static int hevc_update_thread_context(AVCodecContext *dst, ++ const AVCodecContext *src) ++{ ++ HEVCRpiContext *s = dst->priv_data; ++ HEVCRpiContext *s0 = src->priv_data; ++ int i, ret; ++ ++ av_assert0(s->context_initialized); ++ ++ // dst == src can happen according 
to the comments and in that case ++ // there is nothing to do here ++ if (dst == src) ++ return 0; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++ if (s0->DPB[i].frame->buf[0]) { ++ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); ++ if (ret < 0) ++ return ret; ++ } ++ } ++ ++ if (s->ps.sps != s0->ps.sps) ++ s->ps.sps = NULL; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { ++ av_buffer_unref(&s->ps.vps_list[i]); ++ if (s0->ps.vps_list[i]) { ++ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); ++ if (!s->ps.vps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ av_buffer_unref(&s->ps.sps_list[i]); ++ if (s0->ps.sps_list[i]) { ++ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); ++ if (!s->ps.sps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { ++ av_buffer_unref(&s->ps.pps_list[i]); ++ if (s0->ps.pps_list[i]) { ++ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); ++ if (!s->ps.pps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ if (s->ps.sps != s0->ps.sps) ++ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) ++ return ret; ++ ++ s->seq_decode = s0->seq_decode; ++ s->seq_output = s0->seq_output; ++ s->pocTid0 = s0->pocTid0; ++ s->max_ra = s0->max_ra; ++ s->eos = s0->eos; ++ s->no_rasl_output_flag = s0->no_rasl_output_flag; ++ ++ s->is_nalff = s0->is_nalff; ++ s->nal_length_size = s0->nal_length_size; ++ ++ s->threads_type = s0->threads_type; ++ ++ if (s0->eos) { ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ s->sei.frame_packing = s0->sei.frame_packing; ++ s->sei.display_orientation = s0->sei.display_orientation; ++ s->sei.mastering_display = s0->sei.mastering_display; ++ s->sei.content_light = s0->sei.content_light; ++ s->sei.alternative_transfer = s0->sei.alternative_transfer; ++ ++ // * We do this here as it allows us to easily 
locate our parents ++ // global job pool, but there really should be a less nasty way ++ if (s->jbc == NULL) ++ { ++ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL); ++ hevc_init_worker(s); ++ } ++ ++ return 0; ++} ++#endif ++ ++#include ++static int qpu_ok(void) ++{ ++ static int is_pi3 = -1; ++ if (is_pi3 == -1) ++ { ++ struct stat sb; ++ is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0); ++ } ++ return is_pi3; ++} ++ ++static av_cold int hevc_decode_init(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ int ret; ++ ++ if (!qpu_ok()) ++ return AVERROR_DECODER_NOT_FOUND; ++ ++ if ((ret = hevc_init_context(avctx)) < 0) ++ return ret; ++ ++ // If we are a child context then stop now ++ // Everything after this point is either 1st decode setup or global alloc ++ // that must not be repeated ++ // Global info will be copied into children in update_thread_context (we ++ // can't do it here as we have no way of finding the parent context) ++ if (avctx->internal->is_copy) ++ return 0; ++ ++ // Job allocation requires VCSM alloc to work so ensure that we have it ++ // initialised by this point ++ { ++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); ++ if (jbg == NULL) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ } ++ ++ hevc_init_worker(s); ++ ++ s->eos = 1; ++ ++ if (avctx->extradata_size > 0 && avctx->extradata) { ++ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0) ++ goto fail; ++ ++ if (!all_sps_supported(s)) { ++ ret = AVERROR_DECODER_NOT_FOUND; ++ goto fail; ++ } ++ } ++ ++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) ++ s->threads_type = FF_THREAD_FRAME; ++ else ++ s->threads_type = 0; 
++ ++ return 0; ++ ++fail: ++ hevc_decode_free(avctx); ++ return ret; ++} ++ ++static void hevc_decode_flush(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ ff_hevc_rpi_flush_dpb(s); ++ s->max_ra = INT_MAX; ++ s->eos = 1; ++} ++ ++typedef struct hwaccel_rpi3_qpu_env_s { ++ const AVClass *av_class; ++ AVZcEnvPtr zc; ++} hwaccel_rpi3_qpu_env_t; ++ ++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data; ++ int rv; ++ ++ if (av_rpi_zc_in_use(s)) ++ { ++ rv = s->get_buffer2(s, frame, 0); ++ } ++ else ++ { ++ rv = av_rpi_zc_get_buffer(r3->zc, frame); ++ if (rv == 0) ++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc ++ } ++ ++ if (rv == 0 && ++ (rv = ff_attach_decode_data(frame)) < 0) ++ { ++ av_frame_unref(frame); ++ } ++ ++ return rv; ++} ++ ++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; ++ av_rpi_zc_int_env_freep(&r3->zc); ++ return 0; ++} ++ ++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; ++ ++ if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n"); ++ hwaccel_rpi3_qpu_free(avctx); ++ return AVERROR(ENOMEM); ++} ++ ++ ++#define OFFSET(x) offsetof(HEVCRpiContext, x) ++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) ++ ++ ++static const AVOption options[] = { ++ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin), ++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, ++ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), ++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, ++ { NULL }, ++}; ++ ++static const AVClass hevc_rpi_decoder_class = { ++ .class_name = "HEVC RPI 
decoder", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++}; ++ ++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { ++ AV_PIX_FMT_SAND128, ++ AV_PIX_FMT_SAND64_10, ++ AV_PIX_FMT_NONE ++}; ++ ++ ++static const AVHWAccel hwaccel_rpi3_qpu = { ++ .name = "Pi3 QPU Hwaccel", ++ .alloc_frame = hwaccel_alloc_frame, ++ .init = hwaccel_rpi3_qpu_init, ++ .uninit = hwaccel_rpi3_qpu_free, ++ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND128, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND64_10, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++ ++ ++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { ++ &hevc_rpi_hw_config_sand128, ++ &hevc_rpi_hw_config_sand64_10, ++ NULL ++}; ++ ++ ++AVCodec ff_hevc_rpi_decoder = { ++ .name = "hevc_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .priv_data_size = sizeof(HEVCRpiContext), ++ .priv_class = &hevc_rpi_decoder_class, ++ .init = hevc_decode_init, ++ .close = hevc_decode_free, ++ .decode = hevc_rpi_decode_frame, ++ .flush = hevc_decode_flush, ++ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), ++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++ AV_CODEC_CAP_HARDWARE | ++ AV_CODEC_CAP_AVOID_PROBING | ++#if 0 ++ // Debugging is often easier without threads getting in the way ++ 0, ++#warning H265 threading turned off ++#else ++ // We only have decent optimisation for frame - so only admit 
to that ++ AV_CODEC_CAP_FRAME_THREADS, ++#endif ++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | ++ FF_CODEC_CAP_EXPORTS_CROPPING | ++ FF_CODEC_CAP_ALLOCATE_PROGRESS, ++ .pix_fmts = hevc_rpi_pix_fmts, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++ .hw_configs = hevc_rpi_hw_configs, ++// .wrapper_name = "hevc_rpi", ++}; ++ +diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h +new file mode 100644 +index 0000000000..1f94d18673 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.h +@@ -0,0 +1,1091 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCDEC_H ++#define AVCODEC_RPI_HEVCDEC_H ++ ++#include "config.h" ++ ++#include ++ ++#include "libavutil/buffer.h" ++ ++#include "avcodec.h" ++#include "bswapdsp.h" ++#include "cabac.h" ++#include "get_bits.h" ++#include "rpi_hevcpred.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_mv.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++#include "rpi_hevcdsp.h" ++#include "internal.h" ++#include "thread.h" ++#include "videodsp.h" ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_misc_neon.h" ++#endif ++ ++#define MAX_NB_THREADS 16 ++#define SHIFT_CTB_WPP 2 ++ ++//TODO: check if this is really the maximum ++#define MAX_TRANSFORM_DEPTH 5 ++ ++#define MAX_TB_SIZE 32 ++#define MAX_QP 51 ++#define DEFAULT_INTRA_TC_OFFSET 2 ++ ++#define HEVC_CONTEXTS 199 ++ ++#define MRG_MAX_NUM_CANDS 5 ++ ++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 ++ ++// Size of DPB array ++#define HEVC_DPB_ELS 32 ++ ++#define L0 0 ++#define L1 1 ++ ++#define EPEL_EXTRA_BEFORE 1 ++#define EPEL_EXTRA_AFTER 2 ++#define EPEL_EXTRA 3 ++#define QPEL_EXTRA_BEFORE 3 ++#define QPEL_EXTRA_AFTER 4 ++#define QPEL_EXTRA 7 ++ ++#define EDGE_EMU_BUFFER_STRIDE 80 ++ ++#include ++#include "rpi_qpu.h" ++ ++// Max jobs per frame thread. Actual usage will be limited by the size ++// of the global job pool ++// ?? 
Limits ++#define RPI_MAX_JOBS 8 ++ ++// This is the number of _extra_ bit threads - we will have ++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing ++// ++// 0 is legitimate and will disable our WPP processing ++//#define RPI_EXTRA_BIT_THREADS 0 ++#define RPI_EXTRA_BIT_THREADS 2 ++ ++// Number of separate threads/passes in worker ++// 2 and 3 are the currently valid numbers ++// At the moment 3 seems fractionally faster ++//#define RPI_PASSES 2 ++#define RPI_PASSES 3 ++ ++// Print out various usage stats ++#define RPI_TSTATS 0 ++ ++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form ++#define RPI_COMPRESS_COEFFS 1 ++ ++// Wait for VPU/QPU to finish in worker pass 0 ++// If 0 then the wait is in pass 1 ++// ++// One might expect the better place to wait would be in pass 1 however ++// testing shows that pass 0 produces overall faster decode. ++// Interestingly it is QPU/VPU limited streams that seem to suffer ++// from pass 1 waits, CPU limited ones tend to show a very mild gain. ++// This define exists so it is easy to test this. ++#define RPI_WORKER_WAIT_PASS_0 1 ++ ++// Use ARM emulation of QPU pred ++// These are for debug only as the emulation makes only limited ++// effort to be fast ++#define RPI_QPU_EMU_Y 0 ++#define RPI_QPU_EMU_C 0 ++ ++// Max width & height we are prepared to consider ++// Sand frame shape calc becomes confused with large frames ++// Some buffer alloc also depends on this ++#define HEVC_RPI_MAX_WIDTH 2048 ++#define HEVC_RPI_MAX_HEIGHT 1088 ++ ++ ++// Min CTB size is 16 ++#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) ++ ++/** ++ * Value of the luma sample at position (x, y) in the 2D array tab. 
++ */ ++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) ++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) ++ ++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) ++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ ++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) ++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) ++ ++enum RPSType { ++ ST_CURR_BEF = 0, ++ ST_CURR_AFT, ++ ST_FOLL, ++ LT_CURR, ++ LT_FOLL, ++ NB_RPS_TYPE, ++}; ++ ++enum SyntaxElement { ++ SAO_MERGE_FLAG = 0, ++ SAO_TYPE_IDX, ++ SAO_EO_CLASS, ++ SAO_BAND_POSITION, ++ SAO_OFFSET_ABS, ++ SAO_OFFSET_SIGN, ++ END_OF_SLICE_FLAG, ++ SPLIT_CODING_UNIT_FLAG, ++ CU_TRANSQUANT_BYPASS_FLAG, ++ SKIP_FLAG, ++ CU_QP_DELTA, ++ PRED_MODE_FLAG, ++ PART_MODE, ++ PCM_FLAG, ++ PREV_INTRA_LUMA_PRED_FLAG, ++ MPM_IDX, ++ REM_INTRA_LUMA_PRED_MODE, ++ INTRA_CHROMA_PRED_MODE, ++ MERGE_FLAG, ++ MERGE_IDX, ++ INTER_PRED_IDC, ++ REF_IDX_L0, ++ REF_IDX_L1, ++ ABS_MVD_GREATER0_FLAG, ++ ABS_MVD_GREATER1_FLAG, ++ ABS_MVD_MINUS2, ++ MVD_SIGN_FLAG, ++ MVP_LX_FLAG, ++ NO_RESIDUAL_DATA_FLAG, ++ SPLIT_TRANSFORM_FLAG, ++ CBF_LUMA, ++ CBF_CB_CR, ++ TRANSFORM_SKIP_FLAG, ++ EXPLICIT_RDPCM_FLAG, ++ EXPLICIT_RDPCM_DIR_FLAG, ++ LAST_SIGNIFICANT_COEFF_X_PREFIX, ++ LAST_SIGNIFICANT_COEFF_Y_PREFIX, ++ LAST_SIGNIFICANT_COEFF_X_SUFFIX, ++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, ++ SIGNIFICANT_COEFF_GROUP_FLAG, ++ SIGNIFICANT_COEFF_FLAG, ++ COEFF_ABS_LEVEL_GREATER1_FLAG, ++ COEFF_ABS_LEVEL_GREATER2_FLAG, ++ COEFF_ABS_LEVEL_REMAINING, ++ COEFF_SIGN_FLAG, ++ LOG2_RES_SCALE_ABS, ++ RES_SCALE_SIGN_FLAG, ++ CU_CHROMA_QP_OFFSET_FLAG, ++ CU_CHROMA_QP_OFFSET_IDX, ++}; ++ ++enum PartMode { ++ PART_2Nx2N = 0, ++ PART_2NxN = 1, ++ PART_Nx2N = 2, ++ PART_NxN = 3, ++ PART_2NxnU = 4, ++ PART_2NxnD = 5, ++ PART_nLx2N = 6, ++ PART_nRx2N = 7, ++}; ++ ++enum PredMode { ++ MODE_INTER = 0, ++ MODE_INTRA, ++ MODE_SKIP, 
++}; ++ ++enum InterPredIdc { ++ PRED_L0 = 0, ++ PRED_L1, ++ PRED_BI, ++}; ++ ++enum PredFlag { ++ PF_INTRA = 0, ++ PF_L0, ++ PF_L1, ++ PF_BI, ++}; ++ ++enum SAOType { ++ SAO_NOT_APPLIED = 0, ++ SAO_BAND, ++ SAO_EDGE, ++ SAO_APPLIED ++}; ++ ++enum SAOEOClass { ++ SAO_EO_HORIZ = 0, ++ SAO_EO_VERT, ++ SAO_EO_135D, ++ SAO_EO_45D, ++}; ++ ++enum ScanType { ++ SCAN_DIAG = 0, ++ SCAN_HORIZ, ++ SCAN_VERT, ++}; ++ ++typedef struct RefPicList { ++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; ++ int list[HEVC_MAX_REFS]; ++ uint8_t isLongTerm[HEVC_MAX_REFS]; ++ int nb_refs; ++} RefPicList; ++ ++typedef struct RefPicListTab { ++ RefPicList refPicList[2]; ++} RefPicListTab; ++ ++typedef struct RpiCodingUnit { ++ unsigned int x; // Passed to deblock ++ unsigned int y; ++ unsigned int x_split; ++ unsigned int y_split; ++ ++ enum PredMode pred_mode; ///< PredMode ++ enum PartMode part_mode; ///< PartMode ++ ++ // Inferred parameters ++ uint8_t intra_split_flag; ///< IntraSplitFlag ++ uint8_t max_trafo_depth; ///< MaxTrafoDepth ++ uint8_t cu_transquant_bypass_flag; ++} RpiCodingUnit; ++ ++typedef struct RpiPredictionUnit { ++ uint8_t intra_pred_mode[4]; ++ uint8_t intra_pred_mode_c[4]; ++ uint8_t chroma_mode_c[4]; ++ uint8_t merge_flag; ++} RpiPredictionUnit; ++ ++typedef struct HEVCRpiTransformUnit { ++ int8_t cu_qp_delta; ++ ++ // Inferred parameters; ++ uint8_t intra_pred_mode; ++ uint8_t intra_pred_mode_c; ++ uint8_t chroma_mode_c; ++ uint8_t is_cu_qp_delta_wanted; ++ uint8_t cu_chroma_qp_offset_wanted; ++ const int8_t * qp_divmod6[3]; ++} HEVCRpiTransformUnit; ++ ++typedef struct DBParams { ++ int8_t beta_offset; // -12 to +12 ++ int8_t tc_offset; // -12 to +12 ++} DBParams; ++ ++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) ++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) ++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) ++#define HEVC_FRAME_FLAG_BUMPING (1 << 3) ++ ++struct HEVCRpiJob; ++ ++typedef struct HEVCRpiFrame { ++ AVFrame *frame; ++ ThreadFrame tf; ++ ColMvField *col_mvf; ++ int poc; 
++ struct HEVCRpiFrame *collocated_ref; ++ ++ AVBufferRef *col_mvf_buf; ++ ++ /** ++ * A sequence counter, so that old frames are output first ++ * after a POC reset ++ */ ++ uint16_t sequence; ++ ++ /** ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; ++} HEVCRpiFrame; ++ ++typedef struct HEVCRpiLocalContext { ++ HEVCRpiTransformUnit tu; ++ ++ CABACContext cc; ++ ++ // Vars that allow us to locate everything from just an lc ++ struct HEVCRpiContext * context; // ??? make const ??? ++ unsigned int lc_n; // lc list el no ++ ++ // Job wait links ++ struct HEVCRpiLocalContext * jw_next; ++ struct HEVCRpiLocalContext * jw_prev; ++ struct HEVCRpiLocalContext * ljw_next; ++ struct HEVCRpiLocalContext * ljw_prev; ++ struct HEVCRpiJob * volatile jw_job; ++ sem_t jw_sem; ++ ++ // ?? Wrap in structure ?? ++ sem_t bt_sem_in; ++ sem_t * bt_psem_out; ++ volatile int bt_terminate; ++ unsigned int ts; ++ unsigned int bt_last_line; // Last line in this bit_thread chunk ++ unsigned int bt_line_no; ++ unsigned int bt_line_width; ++ unsigned int bt_line_inc; ++ ++ struct HEVCRpiJob * jb0; ++ char unit_done; // Set once we have dealt with this slice ++ char bt_is_tile; ++ char last_progress_good; ++ char cabac_init_req; ++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; ++ uint8_t stat_coeff[4]; ++ GetBitContext gb; ++ ++ uint8_t ct_depth; ++ int8_t qp_y; ++ int8_t curr_qp_y; ++ int8_t qPy_pred; ++ ++// N.B. 
Used by asm (neon) - do not change ++#define AVAIL_S_UR 0 ++#define AVAIL_S_U 1 ++#define AVAIL_S_UL 2 ++#define AVAIL_S_L 3 ++#define AVAIL_S_DL 4 ++ ++#define AVAIL_U (1 << AVAIL_S_U) ++#define AVAIL_L (1 << AVAIL_S_L) ++#define AVAIL_UL (1 << AVAIL_S_UL) ++#define AVAIL_UR (1 << AVAIL_S_UR) ++#define AVAIL_DL (1 << AVAIL_S_DL) ++ ++// Intra filters - same number space as avail ++#define FILTER_LIGHT 0x40 ++#define FILTER_STRONG 0x80 ++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) ++ ++ uint8_t ctb_avail; ++ int end_of_ctb_x; ++ int end_of_ctb_y; ++ ++ RpiCodingUnit cu; ++ RpiPredictionUnit pu; ++ ++#define BOUNDARY_LEFT_SLICE (1 << 0) ++#define BOUNDARY_LEFT_TILE (1 << 1) ++#define BOUNDARY_UPPER_SLICE (1 << 2) ++#define BOUNDARY_UPPER_TILE (1 << 3) ++ /* properties of the boundary of the current CTB for the purposes ++ * of the deblocking filter */ ++ unsigned int boundary_flags; ++ ++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) ++ uint8_t ipm_left[IPM_TAB_SIZE]; ++ uint8_t ipm_up[IPM_TAB_SIZE]; ++ ++//#define MVF_STASH_WIDTH 128 ++#define MVF_STASH_WIDTH 64 ++#define MVF_STASH_HEIGHT 64 ++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) ++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) ++ HEVCRpiMvField mvf_ul[1]; ++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; ++ ++ /* +7 is for subpixel interpolation, *2 for high bit depths */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ /* The extended size between the new edge emu buffer is abused by SAO */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ ++} HEVCRpiLocalContext; ++ ++// Each block can have an intra prediction and an add_residual command ++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH ++ ++// Sand only has 2 planes 
(Y/C) ++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) ++ ++// Command for intra prediction and transform_add of predictions to coefficients ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, ++ RPI_PRED_INTRA, ++ RPI_PRED_INTRA_C, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ ++typedef struct HEVCPredCmd { ++ uint8_t type; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t avail; // i_pred - but left here as they pack well ++ uint8_t dummy; ++ union { ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; ++ } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; ++ }; ++} HEVCPredCmd; ++ ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ uint8_t n; // Number of Qs ++ uint8_t n_grp; // Number of Q in a group ++ uint8_t curr; // Current Q number (0..n-1) ++ uint8_t used; // 0 if nothing in any Q, 1 otherwise ++ uint8_t used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} 
HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCoeffEnv { ++ unsigned int n; ++#if RPI_COMPRESS_COEFFS ++ unsigned int packed; // Equal to 1 if coefficients should be being packed ++ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 ++#endif ++ int16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCoeffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiFrameProgressWait { ++ int req; ++ struct HEVCRpiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRpiFrameProgressWait; ++ ++typedef struct HEVCRpiFrameProgressState { ++ struct HEVCRpiFrameProgressWait * first; ++ struct HEVCRpiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRpiFrameProgressState; ++ ++typedef struct RpiBlk ++{ ++ unsigned int x; ++ unsigned int y; ++ unsigned int w; ++ unsigned int h; ++} RpiBlk; ++ ++typedef struct HEVCRpiJob { ++ struct HEVCRpiJob * next; // Free chain ++ struct HEVCRpiJobCtl * jbc_local; ++ const HEVCRpiSPS * sps; // sps used to set up this job ++ ++ int waited; ++ int ctu_ts_first; ++ int ctu_ts_last; ++ RpiBlk bounds; // Bounding box of job ++ ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; ++ rpi_cache_flush_env_t * rfe; ++ ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiFrameProgressWait progress_wait; ++ sem_t sem; ++ rpi_cache_buf_t flush_buf; ++} HEVCRpiJob; ++ ++struct HEVCRpiContext; ++ ++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); ++ ++typedef struct HEVCRpiPassQueue ++{ ++// int pending; ++ volatile int terminate; ++ sem_t sem_in; ++ sem_t * psem_out; 
++ unsigned int job_n; ++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread ++ HEVCRpiWorkerFn * worker; ++ pthread_t thread; ++ uint8_t pass_n; // Pass number - debug ++ uint8_t started; ++} HEVCRpiPassQueue; ++ ++ ++struct HEVCRpiJobGlobal; ++ ++typedef struct HEVCRpiJobCtl ++{ ++ sem_t sem_out; ++ ++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated ++ struct HEVCRpiJobGlobal * jbg; ++ ++ HEVCRpiLocalContext * lcw_head; ++ HEVCRpiLocalContext * lcw_tail; ++ ++ pthread_mutex_t in_lock; ++ int offload_in; ++ ++ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; ++} HEVCRpiJobCtl; ++ ++ ++typedef struct HEVCRpiJobGlobal ++{ ++ intptr_t ref_count; ++ pthread_mutex_t lock; ++ HEVCRpiJob * free1; // Singly linked list of free jobs ++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job ++ HEVCRpiLocalContext * wait_good; // Last good tail ++ HEVCRpiLocalContext * wait_tail; ++ ++} HEVCRpiJobGlobal; ++ ++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++typedef struct HEVCRpiCabacState ++{ ++ uint8_t rice[4]; ++ uint8_t state[HEVC_CONTEXTS]; ++} HEVCRpiCabacState; ++ ++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels ++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) ++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte ++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el ++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + 
HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row ++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ ++typedef struct HEVCRpiContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++ ++ uint8_t threads_type; ++ char qpu_init_ok; ++ ++ /** 1 if the independent slice segment header was successfully parsed */ ++ uint8_t slice_initialized; ++ char used_for_ref; // rpi ++ char is_irap; ++ char offload_recon; ++ uint8_t eos; ///< current packet contains an EOS/EOB NAL ++ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL ++ uint8_t no_backward_pred_flag; ++ uint8_t is_decoded; ++ uint8_t no_rasl_output_flag; ++ ++ ++ /** ++ * Sequence counters for decoded and output frames, so that old ++ * frames are output first after a POC reset ++ */ ++ uint16_t seq_decode; ++ uint16_t seq_output; ++ ++ int width; ++ int height; ++ ++ HEVCRpiJobCtl * jbc; ++ // cabac stash ++ // b0 skip flag ++ // b1+ ct_depth ++ uint8_t * cabac_stash_left; ++ uint8_t * cabac_stash_up; ++ ++ // Function pointers ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; ++ ++ HEVCRpiFrameProgressState progress_states[2]; ++ ++ HEVCRpiCabacState *cabac_save; ++ ++ AVFrame *frame; ++ AVFrame *output_frame; ++ uint8_t *sao_pixel_buffer_h[3]; ++ uint8_t *sao_pixel_buffer_v[3]; ++ ++ unsigned int col_mvf_stride; ++ AVBufferPool *col_mvf_pool; ++ ++ RpiSAOParams *sao; ++ DBParams *deblock; ++ enum HEVCNALUnitType nal_unit_type; ++ int temporal_id; ///< temporal_id_plus1 - 1 ++ HEVCRpiFrame *ref; ++ int poc; ++ int pocTid0; ++ int slice_idx; ///< 
number of the slice being currently decoded ++ int max_ra; ++ ++ int8_t *qp_y_tab; ++ ++ // Deblocking block strength bitmaps ++ unsigned int bs_stride2; ++ unsigned int bs_size; ++ uint8_t *bs_horizontal; ++ uint8_t *bs_vertical; ++ uint8_t *bsf_stash_up; ++ uint8_t *bsf_stash_left; ++ ++#if HEVC_RPI_MAX_CTBS >= 0xffff ++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0) ++ uint32_t *tab_slice_address; ++#else ++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U) /* cast applied after ~: ~(uint16_t)0 is promoted to int and evaluates to -1, which never compares equal to a uint16_t table entry */ ++ uint16_t *tab_slice_address; ++#endif ++ ++ // Bitfield 1 bit per 8 pels (min pcm size) ++ uint8_t *is_pcm; ++ // Bitfield 1 bit per 8 pels (min cb size) ++ // Only needed for CIP as CIP processing is async to the main thread ++ uint8_t *is_intra; ++ ++ // PU ++ HEVCRpiMvField *mvf_up; ++ HEVCRpiMvField *mvf_left; ++ ++ const RefPicList **rpl_up; ++ const RefPicList **rpl_left; ++ RefPicList * refPicList; ++ ++ // CTB-level flags affecting loop filter operation ++ uint8_t *filter_slice_edges; ++ ++ /** used on BE to byteswap the lines for checksumming */ ++ uint8_t *checksum_buf; ++ int checksum_buf_size; ++ ++ const uint8_t *data; ++ ++ H2645Packet pkt; ++ // type of the first VCL NAL of the current frame ++ enum HEVCNALUnitType first_nal_type; ++ ++ uint8_t context_initialized; ++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated ++ ///< as a format defined in 14496-15 ++ int apply_defdispwin; ++ ++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) ++ int nuh_layer_id; ++ ++ struct AVMD5 *md5_ctx; ++ ++ RefPicListTab * rpl_tab; ++ unsigned int rpl_tab_size; ++ ++ uint8_t *is_intra_store; ++ ++ RpiSliceHeader sh; ++ ++ HEVCRpiParamSets ps; ++ ++ HEVCRpiLocalContext *HEVClc; ++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; ++ ++ HEVCRpiFrame DPB[HEVC_DPB_ELS]; ++ ++ ///< candidate references for the current frame ++ RefPicList rps[5]; ++ ++ HEVCRpiPredContext hpc; ++ HEVCDSPContext hevcdsp; ++ ++ HEVCSEIContext sei; ++ ++ // Put structures that allocate non-trivial storage at 
the end ++ // These are mostly used indirectly so position in the structure doesn't matter ++ HEVCRpiPassQueue passq[RPI_PASSES]; ++#if RPI_EXTRA_BIT_THREADS > 0 ++ int bt_started; ++ // This simply contains thread descriptors - task setup is held elsewhere ++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; ++#endif ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif ++} HEVCRpiContext; ++ ++/** ++ * Mark all frames in DPB as unused for reference. ++ */ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); ++ ++/** ++ * Drop all frames currently in DPB. ++ */ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture sets for the current frame. ++ */ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture list(s) for the current slice. ++ */ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); ++ ++ ++/** ++ * Get the number of candidate references for the current frame. ++ */ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); ++ ++/** ++ * Find next frame in output order and put a reference to it in frame. 
++ * @return 1 if a frame was output, 0 otherwise ++ */ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); ++ ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, HEVCRpiMvField * const mv); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX); ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, const int is_coded_block); ++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); ++ ++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra[4]; ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const 
HEVCRpiFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ if (s->threads_type != 0) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ { ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ } ++} ++ ++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) ++{ ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++} ++ ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} ++ ++#define HEVC_RPI_420_ONLY 1 ++#define HEVC_RPI_SAND128_ONLY 1 ++ ++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 
0 : 1; ++#else ++ return s->ps.sps->hshift[cidx]; ++#endif ++} ++ ++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->vshift[cidx]; ++#endif ++} ++ ++static inline int ctx_cfmt(const HEVCRpiContext * const s) ++{ ++#if HEVC_RPI_420_ONLY ++ return 1; ++#else ++ return s->ps.sps->chroma_format_idc; ++#endif ++} ++ ++static inline int frame_stride1(const AVFrame * const frame, const int c_idx) ++{ ++#if HEVC_RPI_SAND128_ONLY ++ return 128; ++#else ++ return frame->linesize[c_idx]; ++#endif ++} ++ ++#if HEVC_RPI_SAND128_ONLY ++// Propagate this decision to later zc includes ++#define RPI_ZC_SAND128_ONLY 1 ++#endif ++ ++#ifndef ff_hevc_rpi_copy_vert ++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int i; ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ } ++} ++#endif ++ ++ ++#if MVF_STASH_WIDTH == 64 ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const 
unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ return (HEVCRpiMvField *)((y < y0_ctb) ? ++ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : ++ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : ++ lc->mvf_stash + ++ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ++ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU; ++} ++ ++#else ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ // If not in the same CTB for Y assume up ++ if (y < y0_ctb) { ++ // If not in the same CTB for X too assume up-left ++ return (HEVCRpiMvField *)(x < x0_ctb ? 
lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); ++ } ++ return mvf_stash_ptr(s, lc, x, y); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ return MVF_STASH_WIDTH_PU; ++} ++#endif ++ ++#endif /* AVCODEC_RPI_HEVCDEC_H */ +diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c +new file mode 100644 +index 0000000000..87f3cc9d14 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.c +@@ -0,0 +1,450 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdsp.h" ++#include "rpi_hevc_mv.h" ++ ++static const int8_t transform[32][32] = { ++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, ++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, ++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, ++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, ++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, ++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, ++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, ++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, ++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, ++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, ++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, ++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, ++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, ++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, ++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, ++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, ++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, ++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, ++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, ++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, ++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, ++ { 78, 
-4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, ++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, ++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, ++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, ++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, ++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, ++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, ++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, ++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, ++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, ++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, ++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, ++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, ++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, ++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, ++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, ++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, ++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, ++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, ++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, ++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, ++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, ++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, ++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, ++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, ++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, ++ { 36, -83, 83, -36, -36, 83, 
-83, 36, 36, -83, 83, -36, -36, 83, -83, 36, ++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, ++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, ++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, ++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, ++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, ++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, ++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, ++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, ++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, ++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, ++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, ++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, ++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, ++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, ++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { ++ { -2, 58, 10, -2}, ++ { -4, 54, 16, -2}, ++ { -6, 46, 28, -4}, ++ { -4, 36, 36, -4}, ++ { -4, 28, 46, -6}, ++ { -2, 16, 54, -4}, ++ { -2, 10, 58, -2}, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { ++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, ++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, ++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} ++}; ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcdsp_template.c" 
++#undef BIT_DEPTH ++/* Packs one 2-bit boundary strength (0 or 1) per curr/neigh PU pair, each repeated dup times, into a word with the first pair in the lowest bits; in_inc0/in_inc1 are the byte steps applied to curr/neigh after each pair. Returns 0 when nothing is emitted. */ ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1) ++{ ++ int shift = 32; ++ uint32_t bs = 0; ++ for (; pus > 0; pus--) { ++ int strength, out; ++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; ++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; ++ int nr_idx0 = neigh->ref_idx[0]; ++ int nr_idx1 = neigh->ref_idx[1]; ++ int neigh_refL0, neigh_refL1; ++ ++ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); /* range-check the indices before they are used to read the neighbour ref lists */ ++ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); ++ neigh_refL0 = neigh_rpl0[nr_idx0]; ++ neigh_refL1 = neigh_rpl1[nr_idx1]; ++#if 1 // This more directly matches the original implementation ++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { ++ // same L0 and L1 ++ if (curr_refL0 == neigh_refL0 && ++ curr_refL0 == curr_refL1 && ++ neigh_refL0 == neigh_refL1) { ++ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && ++ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL0 == curr_refL0 && ++ neigh_refL1 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL1 == curr_refL0 && ++ neigh_refL0 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ 
FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else { ++ strength = 1; ++ } ++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV ++ MvXY curr_mv0, neigh_mv0; ++ ++ if (curr->pred_flag & 1) { ++ curr_mv0 = curr->xy[0]; ++ } else { ++ curr_mv0 = curr->xy[1]; ++ curr_refL0 = curr_refL1; ++ } ++ ++ if (neigh->pred_flag & 1) { ++ neigh_mv0 = neigh->xy[0]; ++ } else { ++ neigh_mv0 = neigh->xy[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ if (curr_refL0 == neigh_refL0) { ++ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else ++ strength = 1; ++ } else ++ strength = 1; ++#else // This has exactly the same effect, but is more suitable for vectorisation ++ MvXY curr_mv[2]; ++ MvXY neigh_mv[2]; ++ memcpy(curr_mv, curr->xy, sizeof curr_mv); ++ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); ++ ++ if (!(curr->pred_flag & 2)) { ++ curr_mv[1] = curr_mv[0]; ++ curr_refL1 = curr_refL0; ++ } ++ if (!(neigh->pred_flag & 2)) { ++ neigh_mv[1] = neigh_mv[0]; ++ neigh_refL1 = neigh_refL0; ++ } ++ if (!(curr->pred_flag & 1)) { ++ curr_mv[0] = curr_mv[1]; ++ curr_refL0 = curr_refL1; ++ } ++ if (!(neigh->pred_flag & 1)) { ++ neigh_mv[0] = neigh_mv[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ strength = 1; ++ ++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); ++ ++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - 
MV_Y(curr_mv[1])) >= 4); ++ ++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); ++#endif ++ ++ curr += in_inc0 / sizeof (HEVCRpiMvField); ++ neigh += in_inc1 / sizeof (HEVCRpiMvField); ++ ++ for (out = dup; out > 0; out--) ++ { ++ bs = (bs >> 2) | ((uint32_t)strength << 30); /* newest value enters at bits 31:30, earlier ones move down */ ++ shift -= 2; ++ } ++ } ++ return shift < 32 ? bs >> shift : 0; /* nothing emitted leaves shift at 32, and shifting a uint32_t by 32 is undefined */ ++} ++ ++ ++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) ++{ ++ unsigned int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } ++} ++ ++ ++ ++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef PEL_FUNC ++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ ++ for(i = 0 ; i < 10 ; i++) \ ++{ \ ++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ ++} ++ ++#undef EPEL_FUNCS ++#define EPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) ++ ++#undef EPEL_UNI_FUNCS ++#define EPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ ++ 
PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) ++ ++#undef EPEL_BI_FUNCS ++#define EPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) ++ ++#undef QPEL_FUNCS ++#define QPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) ++ ++#undef QPEL_UNI_FUNCS ++#define QPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) ++ ++#undef QPEL_BI_FUNCS ++#define QPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ 
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) ++ ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = 
FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#define HEVC_DSP(depth) \ ++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ ++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ ++ hevcdsp->dequant = FUNC(dequant, depth); \ ++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ ++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ ++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ ++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ ++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ ++ \ ++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ ++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ ++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ ++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ ++ \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ ++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ ++ \ ++ QPEL_FUNCS(depth); \ ++ QPEL_UNI_FUNCS(depth); \ ++ QPEL_BI_FUNCS(depth); \ ++ EPEL_FUNCS(depth); \ ++ EPEL_UNI_FUNCS(depth); \ ++ EPEL_BI_FUNCS(depth); \ ++ \ ++ SLICED_LOOP_FILTERS(depth); \ ++ hevcdsp->hevc_h_loop_filter_luma = 
FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) ++int i = 0; ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_DSP(9); ++ break; ++ case 10: ++ HEVC_DSP(10); ++ break; ++ case 12: ++ HEVC_DSP(12); ++ break; ++ default: ++ HEVC_DSP(8); ++ break; ++ } ++ ++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; ++ hevcdsp->cpy_blk = cpy_blk; ++ ++ if (ARCH_PPC) ++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); ++ if (ARCH_X86) ++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); ++ if (ARCH_ARM) ++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); ++ if (ARCH_MIPS) ++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); ++} +diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h +new file mode 100644 +index 0000000000..5a7cdeeb66 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.h +@@ -0,0 +1,177 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCDSP_H ++#define AVCODEC_RPI_HEVCDSP_H ++ ++#include "hevc.h" ++#include "get_bits.h" ++ ++struct HEVCRpiMvField; ++ ++#define MAX_PB_SIZE 64 ++ ++#define RPI_HEVC_SAO_BUF_STRIDE 160 ++ ++ ++typedef struct RpiSAOParams { ++ uint8_t band_position[3]; ///< sao_band_position (Y,U,V) ++ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V) ++ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V) ++ ++ int16_t offset_val[3][5]; ///> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ } ++ dst += stride; ++ } ++} ++ ++ ++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 32); ++} ++ ++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int 
dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 32); ++} ++ ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, 
stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++ ++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++{ ++ int16_t *coeffs = (int16_t *) _coeffs; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (mode) { ++ coeffs += size; ++ for (y = 0; y < size - 1; y++) { ++ for (x = 0; x < size; x++) ++ coeffs[x] += coeffs[x - size]; ++ coeffs += size; ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 1; x < size; x++) ++ coeffs[x] += coeffs[x - 1]; ++ coeffs += size; ++ } ++ } ++} ++ ++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) ++{ ++ int shift = 15 - BIT_DEPTH - log2_size; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (shift > 0) { ++ int offset = 1 << (shift - 1); ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = (*coeffs + offset) >> shift; ++ coeffs++; ++ } ++ } ++ } else { ++ for (y = 0; y < size; 
y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = *coeffs << -shift; ++ coeffs++; ++ } ++ } ++ } ++} ++ ++#define SET(dst, x) (dst) = (x) ++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) ++ ++#define TR_4x4_LUMA(dst, src, step, assign) \ ++ do { \ ++ int c0 = src[0 * step] + src[2 * step]; \ ++ int c1 = src[2 * step] + src[3 * step]; \ ++ int c2 = src[0 * step] - src[3 * step]; \ ++ int c3 = 74 * src[1 * step]; \ ++ \ ++ assign(dst[2 * step], 74 * (src[0 * step] - \ ++ src[2 * step] + \ ++ src[3 * step])); \ ++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ ++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ ++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ ++ } while (0) ++ ++static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++{ ++ int i; ++ int shift = 7; ++ int add = 1 << (shift - 1); ++ int16_t *src = coeffs; ++ ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(src, src, 4, SCALE); ++ src++; ++ } ++ ++ shift = 20 - BIT_DEPTH; ++ add = 1 << (shift - 1); ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); ++ coeffs += 4; ++ } ++} ++ ++#undef TR_4x4_LUMA ++ ++#define TR_4(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ ++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ ++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ ++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ ++ \ ++ assign(dst[0 * dstep], e0 + o0); \ ++ assign(dst[1 * dstep], e1 + o1); \ ++ assign(dst[2 * dstep], e1 - o1); \ ++ assign(dst[3 * dstep], e0 - o0); \ ++ } while (0) ++ ++#define TR_8(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_8[4]; \ ++ int o_8[4] = { 0 }; \ ++ for (i = 0; i < 4; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ ++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ ++ \ ++ for (i = 0; i < 4; i++) { \ ++ assign(dst[i * dstep], e_8[i] + o_8[i]); \ ++ assign(dst[(7 - i) * 
dstep], e_8[i] - o_8[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_16(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_16[8]; \ ++ int o_16[8] = { 0 }; \ ++ for (i = 0; i < 8; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ ++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ ++ \ ++ for (i = 0; i < 8; i++) { \ ++ assign(dst[i * dstep], e_16[i] + o_16[i]); \ ++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_32(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_32[16]; \ ++ int o_32[16] = { 0 }; \ ++ for (i = 0; i < 16; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_32[i] += transform[j][i] * src[j * sstep]; \ ++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ ++ \ ++ for (i = 0; i < 16; i++) { \ ++ assign(dst[i * dstep], e_32[i] + o_32[i]); \ ++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ ++ } \ ++ } while (0) ++ ++#define IDCT_VAR4(H) \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR8(H) \ ++ int limit = FFMIN(col_limit, H); \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR16(H) IDCT_VAR8(H) ++#define IDCT_VAR32(H) IDCT_VAR8(H) ++ ++#define IDCT(H) \ ++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ ++ int col_limit) \ ++{ \ ++ int i; \ ++ int shift = 7; \ ++ int add = 1 << (shift - 1); \ ++ int16_t *src = coeffs; \ ++ IDCT_VAR ## H(H); \ ++ \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(src, src, H, H, SCALE, limit2); \ ++ if (limit2 < H && i%4 == 0 && !!i) \ ++ limit2 -= 4; \ ++ src++; \ ++ } \ ++ \ ++ shift = 20 - BIT_DEPTH; \ ++ add = 1 << (shift - 1); \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ ++ coeffs += H; \ ++ } \ ++} ++ ++#define IDCT_DC(H) \ ++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ ++{ \ ++ int i, j; \ ++ int shift = 14 - BIT_DEPTH; \ ++ int add = 1 << (shift - 1); \ ++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ ++ 
\ ++ for (j = 0; j < H; j++) { \ ++ for (i = 0; i < H; i++) { \ ++ coeffs[i + j * H] = coeff; \ ++ } \ ++ } \ ++} ++ ++IDCT( 4) ++IDCT( 8) ++IDCT(16) ++IDCT(32) ++ ++IDCT_DC( 4) ++IDCT_DC( 8) ++IDCT_DC(16) ++IDCT_DC(32) ++ ++#undef TR_4 ++#undef TR_8 ++#undef TR_16 ++#undef TR_32 ++ ++#undef SET ++#undef SCALE ++ ++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++#define CMP(a, b) (((a) > (b)) - ((a) < (b))) ++ ++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ int diff0 = CMP(src[x], src[x + a_stride]); ++ int diff1 = CMP(src[x], src[x + b_stride]); ++ int offset_val = edge_idx[2 + diff0 + 
diff1]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++ ++ ++#if BIT_DEPTH == 10 ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 ++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if 
(sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++} ++ ++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++ ++ { ++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; ++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; ++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; ++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; 
++ ++ // Restore pixels that can't be modified ++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) ++ dst[y*stride_dst] = src[y*stride_src]; ++ } ++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) ++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; ++ } ++ ++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) ++ dst[x] = src[x]; ++ } ++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) ++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; ++ } ++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) ++ dst[0] = src[0]; ++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) ++ dst[width-1] = src[width-1]; ++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) ++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; ++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) ++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; ++ ++ } ++} ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif ++ ++// --- Plaited chroma versions ++ ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = 
sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ av_assert0(width <= 64); ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } 
++} ++ ++// Do once ++#if BIT_DEPTH == 8 ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 ++#endif ++ ++#undef CMP ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++static void FUNC(put_hevc_pel_pixels)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = src[x] << (14 - BIT_DEPTH); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / 
sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ memcpy(dst, src, width * sizeof(pixel)); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src 
= (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); ++ } ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define QPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - 3 * stride] + \ ++ filter[1] * src[x - 2 * stride] + \ ++ filter[2] * src[x - stride] + \ ++ filter[3] * src[x ] + \ ++ filter[4] * src[x + stride] + \ ++ filter[5] * src[x + 2 * stride] + \ ++ filter[6] * src[x + 3 * stride] + \ ++ filter[7] * src[x + 4 * stride]) ++ ++static void FUNC(put_hevc_qpel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src 
+= srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_hv)(int16_t *dst, ++ uint8_t *_src, ++ ptrdiff_t _srcstride, ++ int height, intptr_t mx, ++ intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel 
*dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++ ++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> 
shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + 
QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) 
{ ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += 
MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 
0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define EPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - stride] + \ ++ filter[1] * src[x] + \ ++ filter[2] * src[x + stride] + \ ++ filter[3] * src[x + 2 * stride]) ++ ++static void FUNC(put_hevc_epel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_hv)(int16_t *dst, ++ 
uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = 
ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ } ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void 
FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ 
tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ 
((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t 
_srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += 
MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++// line zero ++#define P3 pix[-4 * xstride] ++#define P2 pix[-3 * xstride] ++#define P1 pix[-2 * xstride] ++#define P0 pix[-1 * xstride] ++#define Q0 pix[0 * xstride] ++#define Q1 pix[1 * xstride] ++#define Q2 pix[2 * xstride] ++#define Q3 pix[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix[-4 * xstride + 3 * ystride] ++#define TP2 pix[-3 * xstride + 3 * ystride] ++#define TP1 pix[-2 * xstride + 3 * ystride] ++#define TP0 pix[-1 * xstride + 3 * ystride] ++#define TQ0 pix[0 * xstride + 3 * ystride] ++#define TQ1 pix[1 * xstride + 3 * ystride] ++#define TQ2 pix[2 * xstride + 3 * ystride] ++#define TQ3 pix[3 * xstride + 3 * ystride] ++ ++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ++ ptrdiff_t _xstride, ptrdiff_t _ystride, ++ int beta, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int 
beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix += 
ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), ++ beta, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, ++ beta, tc, no_p, no_q); ++} ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++// line zero ++#define P3 pix_l[0 * 
xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); ++ const int no_p = no_f & 1; ++ const int no_q = no_f & 2; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 
0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) ++{ ++ // Just call the non-2 function having massaged the parameters ++ int32_t tc[2] = {tc2 
& 0xffff, tc2 >> 16}; ++ uint8_t no_p[2] = {no_f & 1, no_f & 1}; ++ uint8_t no_q[2] = {no_f & 2, no_f & 2}; ++ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); ++} ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void 
FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++ +diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c +new file mode 100644 +index 0000000000..0aa8809a4b +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.c +@@ -0,0 +1,161 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdec.h" ++ ++#include "rpi_hevcpred.h" ++#if (ARCH_ARM) ++#include "arm/rpi_hevcpred_arm.h" ++#endif ++ ++#define PRED_C 0 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ ++ hpc->intra_pred = FUNC(intra_pred, depth); \ ++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ ++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ ++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ ++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ ++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ ++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ ++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ ++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ ++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ ++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ ++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ ++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ ++ 
hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ ++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ ++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ ++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); ++ ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c = FUNCC(intra_pred, depth); \ ++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ ++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ ++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ ++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ ++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ ++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ ++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ ++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_horizontal_c[1] = 
FUNCC(pred_angular_1, depth); \ ++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ ++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ ++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ ++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); ++ ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++ break; ++ case 10: ++ HEVC_PRED(10); ++ break; ++ case 12: ++ HEVC_PRED(12); ++ break; ++ default: ++ HEVC_PRED(8); ++ break; ++ } ++ ++#if (ARCH_ARM) ++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); ++#elif (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#endif ++} +diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h +new file mode 100644 +index 0000000000..9f0edb8798 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.h +@@ -0,0 +1,123 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCPRED_H ++#define AVCODEC_RPI_HEVCPRED_H ++ ++#include <stddef.h> ++#include <stdint.h> ++#include "config.h" ++ ++struct HEVCRpiContext; ++struct HEVCRpiLocalContext; ++ ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 ++ ++typedef void intra_filter_fn_t( ++ uint8_t * const left, uint8_t * const top, ++ const unsigned int req, const unsigned int avail, ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size); ++ ++typedef struct HEVCRpiPredContext { ++ void (*intra_pred)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ ++ intra_filter_fn_t *intra_filter[4]; ++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void
(*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); ++ ++ void (*intra_pred_c)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ intra_filter_fn_t *intra_filter_c[4]; ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); ++} HEVCRpiPredContext; ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); ++ ++#endif /* AVCODEC_RPI_HEVCPRED_H */ +diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c +new file mode 100644 +index 0000000000..f2ebcad332 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred_template.c +@@ -0,0 +1,1407 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "bit_depth_template.c" ++ ++#include "rpi_hevcdec.h" ++#include "rpi_hevcpred.h" ++ ++#define DUMP_PRED 0 ++ ++#define POS(x, y) src[(x) + stride * (y)] ++ ++// INCLUDED_ONCE defined at EOF ++#ifndef INCLUDED_ONCE ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC 
++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 ++ ++#if BIT_DEPTH == 8 ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t ++#else ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_src_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) ++#endif ++ ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 ++#endif ++ ++ ++#if DUMP_PRED && !defined(INCLUDED_ONCE) ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++ ++#ifndef INCLUDED_ONCE ++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v4 = v | (v << 8); ++ uint32_t * p = (uint32_t *)ptr; ++ v4 = v4 | (v4 << 16); ++ do { ++ *p++ = v4; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v2 = v | (v << 16); ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v2; ++ *p++ = v2; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ } while (--n != 0); ++ } ++} ++ ++// Beware that this inverts the avail ordering ++// For CIP it seems easier this way round ++static unsigned int cip_avail_l(const uint8_t
* is_intra, const int i_stride, const unsigned int i_mask, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int odd_s) ++{ ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ ++ size >>= 2; // Now in 4-pel units ++ s0 >>= 2; ++ ++ if ((avail & AVAIL_DL) != 0) ++ fa |= ((1 << s0) - 1) << (size - s0); ++ if ((avail & AVAIL_L) != 0) ++ fa |= ((1 << size) - 1) << size; ++ if ((avail & AVAIL_UL) != 0) ++ fa |= 1 << (size << 1); ++ ++ if (odd_s) { ++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~1; ++ is_intra += i_stride; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~m; ++ } ++ ++ return fa; ++} ++ ++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s1, unsigned int odd_s) ++{ ++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) ++ { ++ return 0; ++ } ++ else ++ { ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; ++ ++ size >>= 2; // Now in 4-pel units ++ s1 >>= 2; ++ ++ if ((avail & AVAIL_U) != 0) ++ fa |= ((1 << size) - 1); ++ if ((avail & AVAIL_UR) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ ++ if (odd_s) { ++ fa &= im | ~1; ++ im >>= 1; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((im & 1) == 0) ++ fa &= ~m; ++ } ++ return fa; ++ } ++} ++ ++ ++ ++static inline unsigned int rmbd(unsigned int x) ++{ ++#if 1 ++ return __builtin_ctz(x); ++#else ++ unsigned int n = 0; ++ if ((x & 0xffff) == 0) { ++ x >>= 16; ++ n += 16; ++ } ++ if ((x & 0xff) == 0) { ++ x >>= 8; ++ n += 8; ++ } ++ if ((x & 0xf) == 0) { ++ x >>= 4; ++ n += 
4; ++ } ++ if ((x & 0x3) == 0) { ++ x >>= 2; ++ n += 2; ++ } ++ ++ return (x & 1) == 0 ? n + 1 : n; ++#endif ++} ++#endif ++ ++ ++static void FUNC(cip_fill)(pixel * const left, pixel * const top, ++ const unsigned int avail_l, const unsigned int avail_u, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int size) ++{ ++ pixel a; ++ unsigned int i; ++ ++ // 1st find DL value ++ if ((avail_l & 1) == 0) { ++ if (avail_l != 0) ++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; ++ else ++ { ++ // (avail_l | avail_u) != 0 so this must be good ++ const unsigned int n = rmbd(avail_u)*4; ++ a = (n >= size) ? src_ur[n - size] : src_u[n]; ++ } ++ } ++ ++ // L ++ { ++ pixel * d = left + size * 2 - 1; ++ const pixel * s = src_l + (size * 2 - 1) * stride; ++ unsigned int x = avail_l; ++ for (i = 0; i < size * 2; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = a = *s; ++ s -= stride; ++ } ++ else ++ { ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ s -= stride * 4; ++ } ++ } ++ // UL ++ *d = a = (x & 1) != 0 ? 
*s : a; ++ } ++ ++ // U ++ { ++ pixel * d = top; ++ const pixel * s = src_u; ++ unsigned int x = avail_u; ++ ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ ++ // UR ++ s = src_ur; ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ } ++} ++ ++ ++#if !PRED_C && PW == 1 ++#define EXTEND(ptr, val, len) extend_8(ptr, val, len) ++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) ++#define EXTEND(ptr, val, len) extend_16(ptr, val, len) ++#else ++#define EXTEND(ptr, val, len) extend_32(ptr, val, len) ++#endif ++ ++// Reqs: ++// ++// Planar: DL[0], L, ul, U, UR[0] ++// DC: dl, L, ul, U, ur ++// A2-9: DL, L, ul, u, ur ++// A10: dl, L, ul, u, ur ++// A11-17 dl, L, UL, U, ur ++// A18-25 dl, L, Ul, U, ur ++// A26 dl, l, ul, U, ur ++// A27-34 dl, l, ul, U, UR ++ ++#ifndef INCLUDED_ONCE ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++ ++static const uint8_t req_avail_c[35] = ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L 
| AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}; ++ ++static const uint8_t req_avail[4][35] = { ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}, ++{ // 3 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only 
needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | 0, // 3 ++ AVAIL_DL | AVAIL_L | 0, // 4 ++ AVAIL_DL | AVAIL_L | 0, // 5 ++ AVAIL_DL | AVAIL_L | 0, // 6 ++ AVAIL_DL | AVAIL_L | 0, // 7 ++ AVAIL_DL | AVAIL_L | 0, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | 0, // 28 ++ AVAIL_U | AVAIL_UR | 0, // 29 ++ AVAIL_U | AVAIL_UR | 0, // 30 ++ AVAIL_U | AVAIL_UR | 0, // 31 ++ AVAIL_U | AVAIL_UR | 0, // 32 ++ AVAIL_U | AVAIL_UR | 0, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 4 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_LIGHT, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 5 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 ++ AVAIL_L | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_EITHER, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 ++ AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 ++} ++}; ++ ++ ++#endif ++ ++#define filter_light1 FUNC(filter_light1) ++static inline pixel filter_light1(pixel a, pixel b, pixel c) ++{ ++ return (a + b*2 + c + 2) >> 2; ++} ++ ++#define filter_light FUNC(filter_light) ++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) ++{ ++ pixel p0; ++ pixel p2 = *src; ++ // Allow for final pel - it is just clearer to have the call take the actual number of output pels ++ unsigned int n_minus_1 = n - 1; ++ ++ do ++ { ++ src += sstride; ++ p0 = p1; ++ p1 = p2; ++ p2 = *src; ++ *dst++ = filter_light1(p0, p1, p2); ++ } while (--n_minus_1 != 0); ++ *dst = filter_light1(p1, p2, pn); ++} ++ ++#define filter_strong FUNC(filter_strong) ++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) ++{ ++ unsigned int a = 64 * p0 + 32; ++ const int v = p1 - p0; ++ ++ do ++ { ++ *dst++ = (a += v) >> 6; ++ } while (--n != 0); ++} ++ ++#define intra_filter FUNC(intra_filter) ++static av_always_inline void intra_filter( ++ pixel * const left, pixel * const top, ++ const unsigned int req, const unsigned int avail, ++ const pixel *
const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size, ++ const unsigned int log2_size) ++{ ++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); ++ const unsigned int size = 1 << log2_size; ++ ++ // a_ is the first pel in a section working round dl -> ur ++ // b_ is the last ++ // Beware that top & left work out from UL so usage of a_ & b_ may ++ // swap between them. It is a bad naming scheme but I have found no ++ // better ++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; ++ const pixel * b_dl = src_l + size * stride; ++ const pixel * a_l = src_l + (size - 1) * stride; ++ const pixel * b_l = src_l; ++ const pixel * ab_ul = src_l - stride; ++ const pixel * a_u = src_u; ++ const pixel * b_u = src_u + size - 1; ++ const pixel * a_ur = src_ur; ++ const pixel * b_ur = src_ur + top_right_size - 1; ++ ++ const unsigned int want = req & ~avail; ++ const unsigned int have = req & avail; ++ unsigned int i; ++ ++ if ((avail & AVAIL_DL) == 0) ++ { ++ a_dl = a_ur; ++ if ((avail & AVAIL_U) != 0) ++ a_dl = a_u; ++ if ((avail & AVAIL_UL) != 0) ++ a_dl = ab_ul; ++ if ((avail & AVAIL_L) != 0) ++ a_dl = a_l; ++ b_dl = a_dl; ++ } ++ ++ if ((avail & AVAIL_L) == 0) ++ { ++ a_l = b_dl; ++ b_l = b_dl; ++ } ++ if ((avail & AVAIL_UL) == 0) ++ { ++ ab_ul = b_l; ++ } ++ if ((avail & AVAIL_U) == 0) ++ { ++ a_u = ab_ul; ++ b_u = ab_ul; ++ } ++ if ((avail & AVAIL_UR) == 0) ++ { ++ a_ur = b_u; ++ b_ur = b_u; ++ } ++ ++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints ++ { ++ if ((req & AVAIL_UL) != 0) ++ left[-1] = *ab_ul; ++ ++ if ((want & AVAIL_L) != 0) ++ EXTEND(left, *a_l, size); ++ if ((want & AVAIL_DL) != 0) ++ EXTEND(left + size, *a_dl, size); ++ if ((want & AVAIL_U) != 0) ++ EXTEND(top, *a_u, size); ++ if ((want & AVAIL_UR) != 0) ++ EXTEND(top + size, *a_ur, size); ++ ++ if ((have & AVAIL_U) 
!= 0) ++ // Always good - even with sand ++ memcpy(top, a_u, size * sizeof(pixel)); ++ if ((have & AVAIL_UR) != 0) ++ { ++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, *b_ur, ++ size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ for (i = 0; i < size; i++) ++ left[i] = b_l[stride * i]; ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ for (i = 0; i < down_left_size; i++) ++ left[i + size] = b_dl[stride * i]; ++ EXTEND(left + size + down_left_size, *a_dl, ++ size - down_left_size); ++ } ++ } ++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint ++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && ++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) ++ { ++ if ((req & (AVAIL_U | AVAIL_UR)) != 0) ++ filter_strong(top, *ab_ul, *b_ur, size * 2); ++ left[-1] = *ab_ul; ++ if ((req & (AVAIL_L | AVAIL_DL)) != 0) ++ filter_strong(left, *ab_ul, *a_dl, size*2); ++ } ++ else ++ { ++ // Same code for both have & want for UL ++ if ((req & AVAIL_UL) != 0) ++ { ++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); ++ } ++ ++ if ((want & AVAIL_L) != 0) ++ { ++ EXTEND(left, *a_l, size); ++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; ++ } ++ if ((want & AVAIL_DL) != 0) ++ { ++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding ++ EXTEND(left + size, *a_l, size); ++ } ++ if ((want & AVAIL_U) != 0) ++ { ++ EXTEND(top, *a_u, size); ++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; ++ } ++ if ((want & AVAIL_UR) != 0) ++ { ++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding ++ EXTEND(top + size, *a_ur, size); ++ } ++ ++ if ((have & AVAIL_U) != 0) ++ { ++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); ++ } ++ if ((have & AVAIL_UR) != 0) { ++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); ++ top[size*2 - 1] = *b_ur; ++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 
0) ++ { ++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size); ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); ++ left[size*2 - 1] = *a_dl; ++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); ++ } ++ } ++} ++ ++#define INTRA_FILTER(log2_size) \ ++static void FUNC(intra_filter_ ## log2_size)( \ ++ uint8_t * const left, uint8_t * const top, \ ++ const unsigned int req, const unsigned int avail, \ ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ ++ const unsigned int stride, \ ++ const unsigned int top_right_size, const unsigned int down_left_size) \ ++{ \ ++ intra_filter((pixel *)left, (pixel *)top, req, avail, \ ++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ ++} ++ ++INTRA_FILTER(2) ++INTRA_FILTER(3) ++INTRA_FILTER(4) ++INTRA_FILTER(5) ++ ++#undef intra_filter ++#undef INTRA_FILTER ++ ++static void FUNC(intra_pred)(const HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, ++ const unsigned int log2_size) ++{ ++ // c_idx will alaways be 1 for _c versions and 0 for y ++ const unsigned int c_idx = PRED_C; ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int size = (1 << log2_size); ++ const unsigned int x = x0 >> hshift; ++ const unsigned int y = y0 >> vshift; ++ ++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); ++ pixel *const src = c_idx == 0 ? 
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); ++ ++ // Align so we can do multiple loads in the asm ++ // Padded to 16 byte boundary so as not to confuse anything ++ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); ++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ ++ pixel * const left = left_array + 16 / sizeof(pixel); ++ const pixel * top_pred = top; ++ ++ const pixel * src_l = src - 1; ++ const pixel * src_u = src - stride; ++ const pixel * src_ur = src_u + size; ++#if !PRED_C ++ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; ++#else ++ const unsigned int req = req_avail_c[mode]; ++#endif ++ ++ // If we have nothing to pred from then fill with grey ++ // This isn't a common case but dealing with it here means we don't have to ++ // test for it later ++ if (avail == 0) ++ { ++dc_only: ++#if !PRED_C ++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); ++#else ++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); ++#endif ++ return; ++ } ++ ++ { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++ ++ // Can deal with I-slices in 'normal' code even if CIP ++ // This also means that we don't need to generate (elsewhere) is_intra ++ // for IRAP frames ++ if (s->ps.pps->constrained_intra_pred_flag == 1 && ++ s->sh.slice_type != HEVC_SLICE_I) ++ { ++ // * If we ever actually care about CIP performance then we should ++ // special case out size 4 stuff (can be done by 'normal') and ++ // have 8-pel avail masks ++ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), ++ -(int)(s->ps.sps->pcm_width), ++ 1 << (((x - 1) >> (3 - hshift)) & 7), ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), ++ vshift != 0 ? 0 : (y >> 2) & 1); ++ ++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), ++ (x >> (3 - hshift)) & 7, ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), ++ hshift != 0 ? 0 : (x >> 2) & 1); ++ ++ // Anything left? 
++ if ((avail_l | avail_u) == 0) ++ goto dc_only; ++ ++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); ++ ++#if !PRED_C ++ if ((req & FILTER_LIGHT) != 0) ++ { ++ const unsigned threshold = 1 << (BIT_DEPTH - 5); ++ if ((req & FILTER_STRONG) != 0 && ++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && ++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) ++ { ++ filter_strong(top, left[-1], top[63], 64); ++ filter_strong(left, left[-1], left[63], 64); ++ } else ++ { ++ // LHS writes UL too so copy for top ++ const pixel p_ul = left[-1]; ++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); ++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); ++ } ++ } ++#endif ++ } ++ else ++ { ++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); ++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && ++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) ++ { ++ top_pred = src_u; ++ } ++ else ++ { ++#if !PRED_C ++ s->hpc.intra_filter[log2_size - 2] ++#else ++ s->hpc.intra_filter_c[log2_size - 2] ++#endif ++ ((uint8_t *)left, (uint8_t *)top, req, avail, ++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), ++ ur_size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); ++ } ++ } ++ ++ ++#if !PRED_C ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: 
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif ++} ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int size = 1 << trafo_size; ++ for (y = 0; y < size; y++) ++ for (x = 0; x < size; x++) ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++} ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t 
left = (c_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif ++ ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_PLANAR(0) ++PRED_PLANAR(1) ++PRED_PLANAR(2) ++PRED_PLANAR(3) ++ ++#undef PRED_PLANAR ++ ++#if !PRED_C ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ int i, j, x, y; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int dc = size; ++ pixel4 a; ++ for (i = 0; i < size; i++) ++ dc += left[i] + top[i]; ++ ++ dc >>= log2_size + 1; ++ ++ a = PIXEL_SPLAT_X4(dc); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++ ++// if (c_idx == 0 && size < 32) ++// As we now have separate fns for y & c - no need to test that ++ if (size < 32) ++ { ++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; ++ for (x = 1; x < size; x++) ++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; ++ for (y = 1; y < size; y++) ++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; ++ } ++} ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; ++ 
unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; ++ ++ } ++ } ++} ++#endif ++ ++#define PRED_DC(size)\ ++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_DC(0) ++PRED_DC(1) ++PRED_DC(2) ++PRED_DC(3) ++ ++#undef PRED_DC ++ ++ ++ ++ ++#if !PRED_C ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ int i, j; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++} ++#else ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const pixel a = (1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = a; ++ src[j][1] = a; ++ } ++ } ++} ++#endif ++ ++#define PRED_DC0(size)\ ++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc0)(src, stride, size + 2); \ ++} ++ ++PRED_DC0(0) ++PRED_DC0(1) ++PRED_DC0(2) ++PRED_DC0(3) ++ ++#undef PRED_DC0 ++ ++ ++ ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; 
++#endif ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++ const pixel *ref; ++ int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ ++ if (angle < 0) ++ { ++ memcpy(ref_tmp + 1, top, size * PW); ++ ref_tmp[0] = left[-1]; ++ ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++) { ++ int idx = ((y + 1) * angle) >> 5; ++ int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; x += 4) { ++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + ++ fact * ref[x + idx + 2] + 16) >> 5; ++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + ++ fact * ref[x + 1 + idx + 2] + 16) >> 5; ++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + ++ fact * ref[x + 2 + idx + 2] + 16) >> 5; ++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + ++ fact * ref[x + 3 + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (x = 0; x < size; x += 4) ++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); ++ } ++ } ++ if (mode == 26 && size < 32) { ++ for (y = 0; y < size; y++) ++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); ++ } ++ ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ for (x = 0; x <= size; x += 4) ++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); ++ // Inv angle <= -256 so top offset >= 0 ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++) { ++ int idx = ((x + 1) * angle) >> 5; ++ int fact = ((x + 1) * angle) & 31; ++ if (fact) { 
++ for (y = 0; y < size; y++) { ++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + ++ fact * ref[y + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ POS(x, y) = ref[y + idx + 1]; ++ } ++ } ++ if (mode == 10 && size < 32) { ++ for (x = 0; x < size; x += 4) { ++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); ++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); ++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); ++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); ++ } ++ } ++ } ++} ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0) { ++ memcpy(ref_tmp + 1, top, size * 2 * PW); ++ ref_tmp[0][0] = left[-1][0]; ++ ref_tmp[0][1] = left[-1][1]; ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2 * PW); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < 
-1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif ++ ++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); ++} ++ ++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); ++} ++ ++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); ++} ++ ++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); ++} ++ ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ ++#undef EXTEND ++#undef POS ++#undef PW ++ ++#undef filter_light1 ++#undef filter_light ++#undef filter_strong ++#undef ref_gen ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif ++ +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c +new file mode 100644 +index 0000000000..98a0b104b7 +--- 
/dev/null ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,155 @@ ++/* ++Copyright (c) 2012, Broadcom Europe Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MAJOR_NUM 100 ++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) ++#define DEVICE_FILE_NAME "/dev/vcio" ++ ++#include "rpi_mailbox.h" ++//#include ++ ++/* ++ * use ioctl to send mbox property message ++ */ ++ ++static int mbox_property(int file_desc, void *buf) ++{ ++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); ++ ++ if (ret_val < 0) { ++ printf("ioctl_set_msg failed:%d\n", ret_val); ++ } ++ ++#ifdef DEBUG ++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; ++ for (i=0; i ++#include ++#include ++#include ++ ++#include "config.h" ++ ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++ ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++ ++ ++#define OPT_PREFER_CMA 0 ++ ++struct rpi_cache_flush_env_s { ++ struct vcsm_user_clean_invalid2_s v; ++}; ++ ++ ++// GPU memory alloc fns (internal) ++ ++static void gpu_free_internal(GPU_MEM_PTR_T * const p) ++{ ++ if (p->arm != NULL) ++ vcsm_unlock_ptr(p->arm); ++ if (p->vcsm_handle != 0) ++ vcsm_free(p->vcsm_handle); ++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++} ++ ++ ++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, ++ const int numbytes, const unsigned int cache_type, const char * const name) ++{ ++ memset(p, 0, sizeof(*p)); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ ++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); ++ goto fail; ++ } ++ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to 
VC handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ gpu_free_internal(p); ++ return AVERROR(ENOMEM); ++} ++ ++// Public gpu fns ++ ++// Allocate memory on GPU ++// Fills in structure

containing ARM pointer, videocore handle, videocore memory address, numbytes ++// Returns 0 on success. ++// This allocates memory that will not be cached in ARM's data cache. ++// Therefore safe to use without data cache flushing. ++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); ++} ++ ++// This allocates data that will be ++// Cached in ARM L2 ++// Uncached in VPU L2 ++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); ++} ++ ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_free_internal(p); ++} ++ ++void rpi_mem_gpu_uninit(void) ++{ ++ vcsm_exit(); ++ bcm_host_deinit(); ++} ++ ++int rpi_mem_gpu_init(const unsigned int flags) ++{ ++ const int wants_cma = bcm_host_is_fkms_active(); ++ int use_cma; ++ ++ (void)flags; ++ ++ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0) ++ use_cma = 1; ++ else if (vcsm_init_ex(wants_cma ? 
0 : 1, -1) == 0) ++ use_cma = 0; ++ else ++ return AVERROR(EINVAL); ++ ++ bcm_host_init(); ++ ++ return use_cma + 1; ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) ++{ ++ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; ++ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ // Nothing needed ++} ++ ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ if (rfe->v.op_count != 0) { ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ { ++ const int err = errno; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err); ++ rc = AVERROR(err); ++ } ++ rfe->v.op_count = 0; ++ } ++ return rc; ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = rpi_cache_flush_execute(rfe);; ++ ++ return rc; ++} ++ ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) ++{ ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ ++ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); ++ ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ 
if (gm == NULL || size == 0) ++ return; ++ ++ av_assert1(offset <= gm->numbytes); ++ av_assert1(size <= gm->numbytes); ++ av_assert1(offset + size <= gm->numbytes); ++ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; ++ ++#if 0 ++ // *** frame->height is cropped height so not good ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped ++ av_assert0(n 
<= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif ++ ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!av_rpi_is_sand_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ } 
++} ++ ++// Call this to clean and invalidate a region of memory ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_gm_ptr(rfe, p, mode); ++ rpi_cache_flush_finish(rfe); ++} ++ +diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h +new file mode 100644 +index 0000000000..a451079806 +--- /dev/null ++++ b/libavcodec/rpi_mem.h +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_MEM_H ++#define RPI_MEM_H ++ ++typedef struct gpu_mem_ptr_s { ++ unsigned char *arm; // Pointer to memory mapped on ARM side ++ int vc_handle; // Videocore handle of relocatable memory ++ int vcsm_handle; // Handle for use by VCSM ++ int vc; // Address for use in GPU code ++ int numbytes; // Size of memory block ++} GPU_MEM_PTR_T; ++ ++// General GPU functions ++ ++#define GPU_INIT_GPU 1 ++#define GPU_INIT_CMA 2 ++ ++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); ++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); ++extern void gpu_free(GPU_MEM_PTR_T * const p); ++int rpi_mem_gpu_init(const unsigned int flags); ++void rpi_mem_gpu_uninit(void); ++ ++// Cache flush stuff ++ ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; ++ ++typedef struct {uint32_t t[33];} rpi_cache_buf_t; ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); ++// Free env without flushing ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & clear but do not free the env ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & free the env ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); ++ ++typedef enum ++{ ++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, ++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, ++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 
++} rpi_cache_flush_mode_t; ++ ++struct AVFrame; ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, ++ const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); ++ ++// init, add, finish for one gm ptr ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); ++ ++#endif +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c +new file mode 100644 +index 0000000000..cb7b96119e +--- /dev/null ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,776 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include "libavutil/avassert.h" ++ ++#include "config.h" ++ ++#include ++#include ++ ++#include ++ ++#include "rpi_mailbox.h" ++#include "rpi_mem.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 ++ ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 ++ ++// QPU "noflush" flags ++// a mixture of flushing & profiling ++ ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results 
++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8   // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU          16  // If unset flush QPU caches & TMUs (uniforms always flushed)
++
++#define vcos_verify_ge0(x) ((x)>=0)
++
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 16384
++
++// Transform coefficients; gpu_init() copies this table into struct GPU.transMatrix2even
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
++// Odd rows
++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
++};
++
++// Code/constants on GPU
++// Layout of the block allocated by gpu_init(); vpu_get_fn() and
++// vpu_get_constants() return addresses computed with offsetof() on this struct
++struct GPU
++{
++//  unsigned int qpu_code[QPU_CODE_SIZE];
++    unsigned int vpu_code8[VPU_CODE_SIZE];
++    unsigned int vpu_code10[VPU_CODE_SIZE];
++    short transMatrix2even[16*16*2];
++};
++
++#define WAIT_COUNT_MAX 16
++
++// Per-nesting-level timing: start[] holds the time each currently open level
++// began, total[] the time accumulated at each level, count the open levels
++typedef struct trace_time_one_s
++{
++    int count;
++    int64_t start[WAIT_COUNT_MAX];
++    int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s
++{
++    unsigned int jcount;
++    int64_t start0;
++    int64_t last_update;
++    trace_time_one_t active;
++    trace_time_one_t wait;
++} trace_time_wait_t;
++
++// One wait object: a semaphore plus the free-list link used by the pool
++typedef struct vq_wait_s
++{
++    sem_t sem;
++    struct vq_wait_s * next;
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16
++// Fixed-size pool of wait objects; head is the first free entry (NULL if none)
++typedef struct vq_wait_pool_s
++{
++    vq_wait_t * head;
++    vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++// Process-wide GPU state, created by gpu_init() and destroyed by gpu_term()
++typedef struct gpu_env_s
++{
++    int open_count;             // References held via gpu_lock_ref()
++    int init_count;             // Outstanding vpu_qpu_init() calls
++    int vpu_i_cache_flushed;    // ORed into the VPU execute address; set to 1 after the first VPU job is queued
++    GPU_MEM_PTR_T qpu_code_gm_ptr;
++    GPU_MEM_PTR_T code_gm_ptr;
++    GPU_MEM_PTR_T dummy_gm_ptr;
++    vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++// Monotonic clock reading in nanoseconds
++static int64_t ns_time(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++// Prints the idle time and the accumulated totals of the first four levels
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++    // Update totals for levels that are still pending
++    for (int i = 0; i < tto->count; ++i) {
++        tto->total[i] += now - tto->start[i];
++        tto->start[i] = now;
++    }
++
++    printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++           prefix,
++           T_ARG(now - start0 - tto->total[0]),
++           T_ARG(tto->total[0]),
++           T_ARG(tto->total[1]),
++           T_ARG(tto->total[2]),
++           T_ARG(tto->total[3]));
++}
++
++
++// Opens a new nesting level starting at 'now'
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++    av_assert0(tto->count < WAIT_COUNT_MAX);
++    tto->start[tto->count++] = now;
++}
++
++// Closes the innermost level and adds its elapsed time to that level's total
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++    const int n = --tto->count;
++    av_assert0(n >= 0);
++    tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++    // jcount is unsigned int, so it needs %u
++    printf("Jobs:%u, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++    tto_print(&ttw->active, now, ttw->start0, "Active");
++    tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
++}
++
++#endif
++
++// GPU memory alloc fns (internal)
++
++// Unlocks and frees whatever *p holds, then zeroes it.
++// Safe on an all-zero *p: both releases are skipped.
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{
++    if (p->arm != NULL)
++        vcsm_unlock_ptr(p->arm);
++    if (p->vcsm_handle != 0)
++        vcsm_free(p->vcsm_handle);
++    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++// Allocates numbytes (rounded up to a multiple of 256) of VCSM memory and
++// fills in every field of *p.
++// Returns 0 on success; on any failure *p is released and zeroed and
++// AVERROR(ENOMEM) is returned.
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++    const int numbytes, const unsigned int cache_type, const char * const name)
++{
++    memset(p, 0, sizeof(*p));
++    p->numbytes = (numbytes + 255) & ~255;  // Round up
++
++    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
++        (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
++        (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
++        (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++    {
++        gpu_free_internal(p);
++        return AVERROR(ENOMEM);
++    }
++    return 0;
++}
++
++
++// GPU init, free, lock, unlock
++
++// Destroys the global GPU env: clears 'gpu', releases the GPU buffers, the
++// VCSM connection and the wait pool, then frees the env itself.
++// NOTE(review): vpu_qpu_term() also calls vc_gpuserv_deinit() just before the
++// last unref reaches here - confirm a second deinit is harmless.
++static void gpu_term(void)
++{
++    gpu_env_t * const ge = gpu;
++
++    // We have to hope that everything has terminated...
++    gpu = NULL;
++
++    vc_gpuserv_deinit();
++
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++
++    vcsm_exit();
++
++    vq_wait_pool_deinit(&ge->wait_pool);
++
++    free(ge);
++}
++
++
++// Connect to QPU, returns 0 on success.
++// Builds the GPU environment: allocates the env, sets up the wait pool and
++// VCSM, then loads the QPU shader code, the VPU code + transform constants
++// and a dummy frame into GPU memory.
++// On success the env is stored in *gpu and 0 is returned.
++// On failure *gpu is left NULL, everything acquired here is released again,
++// and the result is -1 (env allocation failed) or the error returned by
++// gpu_malloc_internal().
++static int gpu_init(gpu_env_t ** const gpu) {
++    volatile struct GPU* ptr;
++    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++    int rv;
++    *gpu = NULL;
++
++    if (ge == NULL)
++        return -1;
++
++    vq_wait_pool_init(&ge->wait_pool);
++
++    vcsm_init();
++
++    // Now copy over the QPU code into GPU memory
++    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++        goto fail;
++
++    {
++        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++        // Zero the unused remainder of the code buffer
++        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++    }
++
++    // And the VPU code
++    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++        goto fail;
++    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++    // Zero everything so we have zeros between the code bits
++    memset((void *)ptr, 0, sizeof(*ptr));
++    {
++        int num_bytes = sizeof(rpi_hevc_transform8);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++    }
++    {
++        int num_bytes = sizeof(rpi_hevc_transform10);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++    }
++    // And the transform coefficients
++    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++    // Generate a dummy "frame" & fill with 0x80
++    // * Could reset to 1 << bit_depth?  NOTE(review): tail of this remark was lost in extraction - confirm wording
++    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++        goto fail;
++    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++    *gpu = ge;
++    return 0;
++
++fail:
++    // Undo the partial setup in the same order gpu_term() uses.
++    // gpu_free_internal() skips entries that were never allocated: ge came
++    // from calloc() and gpu_malloc_internal() zeroes its entry on failure.
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++    vcsm_exit();
++    vq_wait_pool_deinit(&ge->wait_pool);
++    free(ge);
++    return rv;
++}
++
++
++
++// Releases gpu_mutex
++static void gpu_unlock(void) {
++    pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
++// Takes gpu_mutex and returns the existing env. Unlike gpu_lock_ref() this
++// never creates the env and takes no reference; release with gpu_unlock().
++static gpu_env_t * gpu_lock(void) {
++    pthread_mutex_lock(&gpu_mutex);
++
++    av_assert1(gpu != NULL);
++    return gpu;
++}
++
++// Takes gpu_mutex, creating the env on first use, and adds a reference.
++// Returns the env with the mutex HELD, or NULL with the mutex already
++// RELEASED if gpu_init() failed - callers must not unlock in that case.
++static gpu_env_t * gpu_lock_ref(void)
++{
++    pthread_mutex_lock(&gpu_mutex);
++
++    if (gpu == NULL) {
++        int rv = gpu_init(&gpu);
++        if (rv != 0) {
++            gpu_unlock();
++            return NULL;
++        }
++    }
++
++    ++gpu->open_count;
++    return gpu;
++}
++
++// Drops one reference (tearing the env down when it was the last) and
++// releases gpu_mutex. Must be called with the mutex held.
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++    if (--ge->open_count == 0)
++        gpu_term();
++
++    gpu_unlock();
++}
++
++// Returns the env without taking the mutex
++static inline gpu_env_t * gpu_ptr(void)
++{
++    av_assert1(gpu != NULL);
++    return gpu;
++}
++
++// Returns the GPU-side address of the VPU code for the given bit depth
++// (8 or 10); any other depth trips av_assert0.
++unsigned int vpu_get_fn(const unsigned int bit_depth) {
++    uint32_t a = 0;
++
++    // Make sure that the gpu is initialized
++    av_assert1(gpu != NULL);
++    switch (bit_depth){
++        case 8:
++            a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++            break;
++        case 10:
++            a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++            break;
++        default:
++            av_assert0(0);
++    }
++    return a;
++}
++
++// Returns the GPU-side address of the transform coefficient table
++unsigned int vpu_get_constants(void) {
++    av_assert1(gpu != NULL);
++    return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
++}
++
++// Adds a reference to the GPU env, creating it if necessary.
++// If creation fails no reference is taken; the function returns void so the
++// failure cannot be reported from here.
++void gpu_ref(void)
++{
++    // gpu_lock_ref() has already released the mutex when it returns NULL, so
++    // only unlock on success - unlocking a mutex we do not hold is undefined
++    if (gpu_lock_ref() != NULL)
++        gpu_unlock();
++}
++
++// Drops a reference taken with gpu_ref()
++void gpu_unref(void)
++{
++    gpu_env_t * const ge = gpu_lock();
++    gpu_unlock_unref(ge);
++}
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++// Initialises every semaphore to 0 and chains all entries into the free list
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++    unsigned int i;
++    for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++        sem_init(&wp->pool[i].sem, 0, 0);
++        wp->pool[i].next = wp->pool + i + 1;
++    }
++    wp->head = wp->pool + 0;
++    // The loop left the last entry pointing one past the array; terminate the list
++    wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
++}
++
++// Destroys every semaphore and empties the free list
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++    unsigned int i;
++    wp->head = NULL;
++    for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++        sem_destroy(&wp->pool[i].sem);
++        wp->pool[i].next = NULL;
++    }
++}
++
++
++// If sem_init
actually takes time then maybe we want a pool...
++// Takes a wait object from the env's pool and adds an env reference, which
++// vq_wait_delete() drops again.
++static vq_wait_t * vq_wait_new(void)
++{
++    gpu_env_t * const ge = gpu_lock_ref();
++    vq_wait_t * wait;
++
++    // gpu_lock_ref() returns NULL (mutex already released) if the env could
++    // not be created - stop here rather than dereference NULL
++    av_assert0(ge != NULL);
++
++    wait = ge->wait_pool.head;
++    // A NULL head means all VQ_WAIT_POOL_SIZE entries are currently in use
++    av_assert0(wait != NULL);
++    ge->wait_pool.head = wait->next;
++    wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_start(&ge->ttw.active, ns_time());
++#endif
++
++    gpu_unlock();
++    return wait;
++}
++
++// Returns a wait object to the pool and drops the env reference taken by
++// vq_wait_new()
++static void vq_wait_delete(vq_wait_t * const wait)
++{
++    gpu_env_t * const ge = gpu_lock();
++    wait->next = ge->wait_pool.head;
++    ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    {
++        trace_time_wait_t * const ttw = &ge->ttw;
++        const int64_t now = ns_time();
++        ++ttw->jcount;
++        tto_end(&ttw->wait, now);
++
++        if (ttw->start0 == 0)
++        {
++            ttw->start0 = ttw->active.start[0];
++            ttw->last_update = ttw->start0;
++        }
++        if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++        {
++            ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++            ttw_print(ttw, now);
++        }
++    }
++#endif
++    gpu_unlock_unref(ge);
++}
++
++// Blocks on the wait object's semaphore, retrying if interrupted by a signal
++static void vq_wait_wait(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    {
++        const int64_t now = ns_time();
++        gpu_env_t * const ge = gpu_lock();
++        tto_start(&ge->ttw.wait, now);
++        gpu_unlock();
++    }
++#endif
++
++    while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++        /* loop */;
++}
++
++// Posts the wait object's semaphore
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    {
++        gpu_env_t *const ge = gpu_lock();
++        tto_end(&ge->ttw.active, ns_time());
++        gpu_unlock();
++    }
++#endif
++
++    sem_post(&wait->sem);
++}
++
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU  1
++#define VPU_QPU_MASK_VPU  2
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++// Resets the caller-supplied job env to "no jobs queued" and returns it.
++// Nothing is allocated here.
++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
++{
++//    vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++    vpu_qpu_job_env_t * vqj = buf;
++//    memset(vqj, 0, sizeof(*vqj));
++    vqj->n = 0;
++    vqj->mask = 0;
++    return vqj;
++}
++
++// Currently a no-op: the env lives in caller-owned storage
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{
++//    memset(vqj, 0, sizeof(*vqj));
++//    free(vqj);
++}
++
++// Hands out the next free slot of vqj->j[]
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++    // Check before consuming the slot so a full array is caught before a
++    // pointer past its end is handed out
++    av_assert1(vqj->n < VPU_QPU_JOB_MAX);
++    return vqj->j + vqj->n++;
++}
++
++// Queues a VPU job running vpu_code with arguments r0..r5.
++// Does nothing if vpu_code is 0.
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++    const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++    if (vpu_code != 0) {
++        struct gpu_job_s *const j = new_job(vqj);
++        vqj->mask |= VPU_QPU_MASK_VPU;
++
++        j->command = EXECUTE_VPU;
++        j->callback.func = 0;
++        j->callback.cookie = NULL;
++        // The bottom two bits of the execute address contain no-flush flags
++        // b0 will flush the VPU I-cache if unset so we nearly always want that set
++        // as we never reload code
++        j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
++        j->u.v.q[1] = r0;
++        j->u.v.q[2] = r1;
++        j->u.v.q[3] = r2;
++        j->u.v.q[4] = r3;
++        j->u.v.q[5] = r4;
++        j->u.v.q[6] = r5;
++        gpu->vpu_i_cache_flushed = 1;
++    }
++}
++
++// flags are QPU_FLAGS_xxx
++// Queues n QPU jobs whose control words (QPU_MAIL_EL_VALS uint32_t each) are
++// copied from mail. Does nothing if n is 0.
++// NOTE(review): n is not range-checked against the size of j->u.q.control,
++// which is declared outside this file - confirm callers cap it.
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++    if (n != 0) {
++        struct gpu_job_s *const j = new_job(vqj);
++        vqj->mask |= VPU_QPU_MASK_QPU;
++
++        j->command = EXECUTE_QPU;
++        j->callback.func = 0;
++        j->callback.cookie = NULL;
++
++        j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++        j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++        j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++        j->u.q.timeout = 5000;
++        memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++    }
++}
++
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
++{
++    vq_wait_post(v);
++}
++
++// Poke a user-supplied sem
++static void vpu_qpu_job_callback_sem(void * v)
++{
++    sem_post((sem_t *)v);
++}
++
++// Shared tail of the vpu_qpu_job_add_sync_*() functions: installs func/cookie
++// as the callback of either the last queued job or a newly added EXECUTE_SYNC
++// job, then clears vqj->mask. Callers must have checked vqj->mask != 0.
++static void vpu_qpu_job_add_sync_cb(vpu_qpu_job_env_t * const vqj, void (* const func)(void *), void * const cookie)
++{
++    // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++    // If we only posted one thing or only QPU jobs
++    if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++    {
++        struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++        av_assert1(j->callback.func == 0);
++
++        j->callback.func = func;
++        j->callback.cookie = cookie;
++    }
++    else
++    {
++        struct gpu_job_s * const j = new_job(vqj);
++
++        j->command = EXECUTE_SYNC;
++        j->u.s.mask = vqj->mask;
++        j->callback.func = func;
++        j->callback.cookie = cookie;
++    }
++
++    vqj->mask = 0;
++}
++
++// Adds a sync for the jobs queued so far and stores its wait handle in
++// *wait_h, for use with vpu_qpu_wait(). If nothing is queued *wait_h is set
++// to NULL and no sync is added.
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{
++    vq_wait_t * wait;
++
++    if (vqj->mask == 0) {
++        *wait_h = NULL;
++        return;
++    }
++
++    // We are going to want a sync object
++    wait = vq_wait_new();
++
++    vpu_qpu_job_add_sync_cb(vqj, vpu_qpu_job_callback_wait, wait);
++    *wait_h = wait;
++}
++
++// Returns 0 if no sync added ('cos Q empty), 1 if sync added
++// When a sync is added, its callback posts the caller-supplied sem
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
++{
++    // If nothing on q then just return
++    if (vqj->mask == 0)
++        return 0;
++
++    vpu_qpu_job_add_sync_cb(vqj, vpu_qpu_job_callback_sem, sem);
++    return 1;
++}
++
++
++// Submits the queued jobs; returns 0 without submitting if none are queued,
++// otherwise the result of vc_gpuserv_execute_code()
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++    if (vqj->n == 0)
++        return 0;
++
++    return vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
++
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++    int rv;
++    rv = vpu_qpu_job_start(vqj);
++    vpu_qpu_job_delete(vqj);
++    return rv;
++}
++
++// Waits on *wait_h, returns the wait object to the pool and NULLs the handle.
++// A NULL wait_h or a NULL *wait_h is a no-op.
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++    if (wait_h != NULL)
++    {
++        vq_wait_t * const wait = *wait_h;
++        if (wait != NULL) {
++            *wait_h = NULL;
++            vq_wait_wait(wait);
++            vq_wait_delete(wait);
++        }
++    }
++}
++
++// Takes a reference on the GPU env (creating it if needed) and starts gpuserv
++// on the first call. Returns 0 on success, -1 if the env could not be created.
++// (void) matches the prototype in rpi_qpu.h
++int vpu_qpu_init(void)
++{
++    gpu_env_t * const ge = gpu_lock_ref();
++    if (ge == NULL)
++        return -1;
++
++    if (ge->init_count++ == 0)
++    {
++        vc_gpuserv_init();
++    }
++
++    gpu_unlock();
++    return 0;
++}
++
++// Undoes one vpu_qpu_init(): stops gpuserv when the last init goes away and
++// drops the env reference.
++// NOTE(review): if this is also the last env reference, gpu_term() calls
++// vc_gpuserv_deinit() a second time - confirm that is harmless.
++void vpu_qpu_term(void)
++{
++    gpu_env_t * const ge = gpu_lock();
++
++    if (--ge->init_count == 0) {
++        vc_gpuserv_deinit();
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++        ttw_print(&ge->ttw, ns_time());
++#endif
++    }
++
++    gpu_unlock_unref(ge);
++}
++
++// Translates the address of a shader entry point inside ff_hevc_rpi_shader
++// into the matching address in the GPU copy of the QPU code
++uint32_t qpu_fn(const int * const mc_fn)
++{
++    return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
++}
++
++// Returns the GPU-side address of the dummy frame
++uint32_t qpu_dummy(void)
++{
++    return gpu->dummy_gm_ptr.vc;
++}
++
++// Fills *qf with the QPU addresses of the motion-compensation filters for the
++// given bit depth. Returns 0 for depth 8 or 10; for any other depth returns -1
++// and leaves every member holding a recognisable dummy value.
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
++{
++    // Dummy values we can catch with emulation
++    qf->y_pxx = ~1U;
++    qf->y_bxx = ~2U;
++    qf->y_p00 = ~3U;
++    qf->y_b00 = ~4U;
++    qf->c_pxx = ~5U;
++    qf->c_bxx = ~6U;
++    qf->c_pxx_l1 = ~7U;  // Was left uninitialised on the error path
++
++    switch (bit_depth) {
++        case 8:
++            qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++            qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++            qf->y_p00 = qpu_fn(mc_filter_y_p00);
++            qf->y_b00 = qpu_fn(mc_filter_y_b00);
++            qf->c_pxx = qpu_fn(mc_filter_c_p);
++            qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++            qf->c_bxx = qpu_fn(mc_filter_c_b);
++            break;
++        case 10:
++            qf->c_pxx = qpu_fn(mc_filter_c10_p);
++            qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++            qf->c_bxx = qpu_fn(mc_filter_c10_b);
++            qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++            qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++            qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++            qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++            break;
++        default:
++            return -1;
++    }
++    return 0;
++}
++
+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+new file mode 100644
+index
0000000000..8777687021 +--- /dev/null ++++ b/libavcodec/rpi_qpu.h +@@ -0,0 +1,103 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#pragma GCC diagnostic ignored "-Wstrict-prototypes"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"  // for gpu_job_s
++#pragma GCC diagnostic pop
++
++// QPU specific functions
++
++typedef struct HEVCRpiQpu {
++    uint32_t c_pxx;
++    uint32_t c_pxx_l1;
++    uint32_t c_bxx;
++    uint32_t y_pxx;
++    uint32_t y_bxx;
++    uint32_t y_p00;
++    uint32_t y_b00;
++} HEVCRpiQpu;
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
++
++uint32_t qpu_fn(const int * const mc_fn);
++uint32_t qpu_dummy(void);
++
++#define QPU_N_GRP    4
++#define QPU_N_MAX    12
++
++#define QPU_MAIL_EL_VALS  2
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
++
++// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++    unsigned int n;
++    unsigned int mask;
++    struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
++
++// Waits for previous post_codee to complete and Will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++void gpu_ref(void);
++void gpu_unref(void);
++
++#endif
+diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+new file mode 100644
+index 0000000000..37be9a0f49
+--- /dev/null
++++ b/libavcodec/rpi_zc.c
+@@ -0,0 +1,1227 @@
++#include "config.h"
++
++#include "libavcodec/avcodec.h"
++#include "rpi_mem.h"
++#include "rpi_mailbox.h"
++#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++// NOTE(review): the header names of the bare #include directives in this
++// file appear to have been lost in extraction - restore them from upstream
++#include
++
++#include "libavutil/buffer_internal.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include
++#include
++#pragma GCC diagnostic pop
++
++#define TRACE_ALLOC 0
++#define DEBUG_ALWAYS_KEEP_LOCKED 0
++
++struct ZcPoolEnt;
++
++// Pool of free buffers that all have the same allocated size
++typedef struct ZcPool
++{
++    size_t numbytes;            // Size of the entries currently kept in the pool
++    struct ZcPoolEnt * head;    // Singly linked list of free entries
++    pthread_mutex_t lock;       // Guards numbytes & head
++} ZcPool;
++
++// A single buffer; the handle/address fields other than vcsm_handle are
++// filled in lazily by the zc_pool_ent_*_v accessors
++typedef struct ZcPoolEnt
++{
++    size_t numbytes;            // Allocated size (after rounding)
++
++    unsigned int vcsm_handle;
++    unsigned int vc_handle;
++    void * map_arm;
++    unsigned int map_vc;
++
++    struct ZcPoolEnt * next;
++    struct ZcPool * pool;       // Pool this entry is returned to
++} ZcPoolEnt;
++
++// Codec context values replaced by av_rpi_zc_init2 & restored by _uninit2
++typedef struct ZcOldCtxVals
++{
++    int thread_safe_callbacks;
++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++    void * opaque;
++} ZcOldCtxVals;
++
++typedef struct AVZcEnv
++{
++    unsigned int refcount;
++    ZcOldCtxVals old;
++
++    void * pool_env;
++    av_rpi_zc_alloc_buf_fn_t * alloc_buf;
++    av_rpi_zc_free_pool_fn_t * free_pool;
++
++    unsigned int pool_size;
++} ZcEnv;
++
++typedef struct ZcUserBufEnv {
++    void * v;
++    const av_rpi_zc_buf_fn_tab_t * fn;
++    size_t numbytes;
++    int offset;
++} ZcUserBufEnv;
++
++#define ZC_BUF_INVALID  0
++#define ZC_BUF_VALID    1
++#define ZC_BUF_NEVER    2
++
++typedef struct ZcBufEnv {
++    GPU_MEM_PTR_T gmem;
++    AVZcEnvPtr zc;
++    int is_valid;
++    AVBufferRef * user;
++    AVRpiZcFrameGeometry geo;
++    size_t size_y;
++    size_t size_c;
++    size_t size_pic;
++    ssize_t offset;
++    pthread_mutex_t lock;
++    pthread_cond_t cond;
++} ZcBufEnv;
++
++
++
++
++
++
++#define ALLOC_PAD       0
++#define ALLOC_ROUND     0x1000
++#define STRIDE_ROUND    64
++#define STRIDE_OR       0
++
++#define DEBUG_ZAP0_BUFFERS 0
++
++// Non-zero if format is one of the sand (striped) pixel formats
++static inline int av_rpi_is_sand_format(const int format)
++{
++    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) ||
++        (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++    return av_rpi_is_sand_format(frame->format);
++}
++
++//----------------------------------------------------------------------------
++//
++// Internal pool stuff
++
++// Pool entry functions
++
++// Allocate a new pool entry of at least req_size bytes
++// Returns NULL (after logging) if either allocation fails
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
++{
++    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
++
++    // Add ALLOC_PAD (currently 0) & round up to a multiple of ALLOC_ROUND (4k)
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
++    if (zp == NULL) {
++        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
++        goto fail0;
++    }
++
++    // The 0x80 here maps all pages here rather than waiting for lazy mapping
++    // BEWARE that in GPU land a later unlock/lock pair will put us back into
++    // lazy mode - which will also break cache invalidate calls.
++    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
++    {
++        // alloc_size is unsigned so print it as such
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
++        goto fail1;
++    }
++
++#if TRACE_ALLOC
++    printf("%s: Alloc %#x bytes @ h=%u\n", __func__, alloc_size, zp->vcsm_handle);
++#endif
++
++    zp->numbytes = alloc_size;
++    zp->pool = pool;
++    return zp;
++
++fail1:
++    av_free(zp);
++fail0:
++    return NULL;
++}
++
++// Release the GPU memory of an entry (unlocking it first if it was mapped
++// to an ARM address) and free the entry itself
++static void zc_pool_ent_free(ZcPoolEnt * const zp)
++{
++#if TRACE_ALLOC
++    // numbytes is a size_t so needs the z length modifier
++    printf("%s: Free %#zx bytes @ h=%u\n", __func__, zp->numbytes, zp->vcsm_handle);
++#endif
++
++    if (zp->vcsm_handle != 0)
++    {
++        // VC addr & handle need no dealloc
++        if (zp->map_arm != NULL)
++            vcsm_unlock_hdl(zp->vcsm_handle);
++        vcsm_free(zp->vcsm_handle);
++    }
++    av_free(zp);
++}
++
++//----------------------------------------------------------------------------
++//
++// Pool functions
++
++// Free every entry on a (detached) list
++static void zc_pool_free_ent_list(ZcPoolEnt * p)
++{
++    while (p != NULL)
++    {
++        ZcPoolEnt * const zp = p;
++        p = p->next;
++        zc_pool_ent_free(zp);
++    }
++}
++
++// Empty the pool. Takes no lock - only called from zc_pool_delete
++static void zc_pool_flush(ZcPool * const pool)
++{
++    ZcPoolEnt * p = pool->head;
++    pool->head = NULL;
++    // Same "no size" value as zc_pool_new/zc_pool_delete; ~0U is a
++    // different value from that when size_t is wider than unsigned int
++    pool->numbytes = -1;
++    zc_pool_free_ent_list(p);
++}
++
++// Get an entry big enough for req_bytes, reusing a pooled one if the pool
++// size is close enough, otherwise dumping the pool and allocating afresh
++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
++{
++    ZcPoolEnt * zp = NULL;
++    ZcPoolEnt * flush_list = NULL;
++    size_t numbytes;
++
++    pthread_mutex_lock(&pool->lock);
++
++    numbytes = pool->numbytes;
++
++    // If size isn't close then dump the pool
++    // Close in this context means within 128k
++    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
++    {
++        flush_list = pool->head;
++        pool->head = NULL;
++        pool->numbytes = numbytes = req_bytes;
++    }
++    else if (pool->head != NULL)
++    {
++        zp = pool->head;
++        pool->head = zp->next;
++    }
++
++    pthread_mutex_unlock(&pool->lock);
++
++    // Frees & allocs are done outside the lock
++    zc_pool_free_ent_list(flush_list);
++
++    if (zp == NULL)
++        zp = zc_pool_ent_alloc(pool, numbytes);
++
++    return zp;
++}
++
++// Return an entry to its pool, or free it if the pool has since changed to
++// a different entry size. NULL is a NOP
++static void zc_pool_put_ent(ZcPoolEnt * const zp)
++{
++    ZcPool * const pool = zp == NULL ? NULL : zp->pool;
++    if (zp != NULL)
++    {
++        pthread_mutex_lock(&pool->lock);
++#if TRACE_ALLOC
++        printf("%s: Recycle %#zx, %#zx\n", __func__, pool->numbytes, zp->numbytes);
++#endif
++
++        if (pool->numbytes == zp->numbytes)
++        {
++            zp->next = pool->head;
++            pool->head = zp;
++            pthread_mutex_unlock(&pool->lock);
++        }
++        else
++        {
++            pthread_mutex_unlock(&pool->lock);
++            zc_pool_ent_free(zp);
++        }
++    }
++}
++
++// Create an empty pool; returns NULL on allocation failure
++static ZcPool *
++zc_pool_new(void)
++{
++    ZcPool * const pool = av_mallocz(sizeof(*pool));
++    if (pool == NULL)
++        return NULL;
++
++    pool->numbytes = -1;
++    pool->head = NULL;
++    pthread_mutex_init(&pool->lock, NULL);
++    return pool;
++}
++
++// Free all pooled entries and the pool itself. NULL is a NOP
++static void
++zc_pool_delete(ZcPool * const pool)
++{
++    if (pool != NULL)
++    {
++        pool->numbytes = -1;
++        zc_pool_flush(pool);
++        pthread_mutex_destroy(&pool->lock);
++        av_free(pool);
++    }
++}
++
++//============================================================================
++//
++// ZC implementation using above pool implementation
++//
++// Fn table fns...
++
++// Fn table wrappers: v is always a ZcPoolEnt *
++
++static void zc_pool_free_v(void * v)
++{
++    zc_pool_put_ent(v);
++}
++
++static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
++{
++    ZcPoolEnt * zp = v;
++    return zp->vcsm_handle;
++}
++
++// Get (and cache) the VC handle; returns 0 on failure
++static unsigned int zc_pool_ent_vc_handle_v(void * v)
++{
++    ZcPoolEnt * zp = v;
++    if (zp->vc_handle == 0)
++    {
++        if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0)
++            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %u to VC handle\n",
++                   __func__, zp->vcsm_handle);
++    }
++    return zp->vc_handle;
++}
++
++// Get (and cache) the ARM mapping; returns NULL on failure
++static void * zc_pool_ent_map_arm_v(void * v)
++{
++    ZcPoolEnt * zp = v;
++    if (zp->map_arm == NULL)
++    {
++        if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL)
++            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %u to ARM address\n",
++                   __func__, zp->vcsm_handle);
++    }
++    return zp->map_arm;
++}
++
++// Get (and cache) the VC address; returns 0 on failure
++static unsigned int zc_pool_ent_map_vc_v(void * v)
++{
++    ZcPoolEnt * zp = v;
++    if (zp->map_vc == 0)
++    {
++        if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0)
++            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %u to VC address\n",
++                   __func__, zp->vcsm_handle);
++    }
++    return zp->map_vc;
++}
++
++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
++    .free = zc_pool_free_v,
++    .vcsm_handle = zc_pool_ent_vcsm_handle_v,
++    .vc_handle = zc_pool_ent_vc_handle_v,
++    .map_arm = zc_pool_ent_map_arm_v,
++    .map_vc = zc_pool_ent_map_vc_v,
++};
++
++// ZC Env fns
++
++// Delete pool
++// All buffers guaranteed freed by now
++static void
++zc_pool_delete_v(void * v)
++{
++    zc_pool_delete((ZcPool *)v);
++    rpi_mem_gpu_uninit();
++}
++
++// Allocate a new ZC buffer
++// Returns NULL (after logging) on failure
++static AVBufferRef *
++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++    ZcPool * const pool = v;
++    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
++    AVBufferRef * buf;
++
++    (void)geo;  // geo ignored here
++
++    if (zp == NULL) {
++        // size is a size_t - %d is the wrong width for it on 64-bit builds
++        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%zu) failed\n", size);
++        goto fail0;
++    }
++
++    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
++        goto fail2;
++    }
++
++    return buf;
++
++fail2:
++    zc_pool_put_ent(zp);
++fail0:
++    return NULL;
++}
++
++// Init wrappers - the public fns
++
++// Create a ZC env that uses the internal pool allocator
++// Returns NULL on failure
++AVZcEnvPtr
++av_rpi_zc_int_env_alloc(void * logctx)
++{
++    ZcEnv * zc;
++    ZcPool * pool_env;
++
++    if (rpi_mem_gpu_init(0) < 0)
++        return NULL;
++
++    if ((pool_env = zc_pool_new()) == NULL)
++        goto fail1;
++
++    if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
++        goto fail2;
++
++    return zc;
++
++fail2:
++    zc_pool_delete(pool_env);
++fail1:
++    rpi_mem_gpu_uninit();
++    return NULL;
++}
++
++// Drop the caller's reference to the env & NULL the caller's pointer
++void
++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
++{
++    const AVZcEnvPtr zc = *zcp;
++    *zcp = NULL;
++    if (zc != NULL)
++        av_rpi_zc_env_release(zc);
++}
++
++//============================================================================
++//
++// Geometry
++//
++// This is a separate chunk to the rest
++
++// Get mailbox fd - should be in a lock when called
++// Rely on process close to close it
++static int mbox_fd(void)
++{
++    static int fd = -1;
++    if (fd != -1)
++        return fd;
++    return (fd = mbox_open());
++}
++
++// Work out strides, heights & stripe layout for the given format & size
++// For the sand formats the layout is obtained via mbox_get_image_params
++// and the last answer is cached per case
++// Unknown formats return a geometry with only format/width/height set
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format, const unsigned int video_width, const unsigned int video_height)
++{
++    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++
++    AVRpiZcFrameGeometry geo = {
++        .format       = format,
++        .video_width  = video_width,
++        .video_height = video_height
++    };
++
++    switch (format)
++    {
++    case AV_PIX_FMT_YUV420P:
++        geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++        geo.stride_c = geo.stride_y / 2;
++        geo.height_y = (video_height + 32 + 31) & ~31;
++        geo.height_c = geo.height_y / 2;
++        geo.planes_c = 2;
++        geo.stripes = 1;
++        geo.bytes_per_pel = 1;
++        geo.stripe_is_yc = 1;
++        break;
++
++    case AV_PIX_FMT_YUV420P10:
++        geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++        geo.stride_c = geo.stride_y / 2;
++        geo.height_y = (video_height + 32 + 31) & ~31;
++        geo.height_c = geo.height_y / 2;
++        geo.planes_c = 2;
++        geo.stripes = 1;
++        geo.bytes_per_pel = 2;
++        geo.stripe_is_yc = 1;
++        break;
++
++    case AV_PIX_FMT_SAND128:
++    case AV_PIX_FMT_RPI4_8:
++    {
++        const unsigned int stripe_w = 128;
++
++        static VC_IMAGE_T img = {0};
++
++        // Given the overhead of calling the mailbox keep a stashed
++        // copy as we will almost certainly just want the same numbers again
++        // but that means we need a lock
++        pthread_mutex_lock(&sand_lock);
++
++        if (img.width != video_width || img.height != video_height)
++        {
++            VC_IMAGE_T new_img = {
++                .type = VC_IMAGE_YUV_UV,
++                .width = video_width,
++                .height = video_height
++            };
++
++            mbox_get_image_params(mbox_fd(), &new_img);
++            img = new_img;
++        }
++
++        geo.stride_y = stripe_w;
++        geo.stride_c = stripe_w;
++        geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++        geo.height_c = img.pitch / stripe_w - geo.height_y;
++        geo.stripe_is_yc = 1;
++        if (geo.height_y * stripe_w > img.pitch)
++        {
++            // "tall" sand - all C blocks now follow Y
++            geo.height_y = img.pitch / stripe_w;
++            geo.height_c = geo.height_y;
++            geo.stripe_is_yc = 0;
++        }
++        geo.planes_c = 1;
++        geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++        geo.bytes_per_pel = 1;
++
++        pthread_mutex_unlock(&sand_lock);
++#if 0
++        printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++               video_width, video_height,
++               geo.stride_y, geo.stride_c,
++               geo.height_y, geo.height_c,
++               geo.stripes, img.pitch);
++#endif
++        av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++        av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++        break;
++    }
++
++    case AV_PIX_FMT_RPI4_10:
++    {
++        const unsigned int stripe_w = 128;  // bytes
++
++        static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++        static VC_IMAGE_T img = {0};
++
++        // Given the overhead of calling the mailbox keep a stashed
++        // copy as we will almost certainly just want the same numbers again
++        // but that means we need a lock
++        pthread_mutex_lock(&sand_lock);
++
++        if (img.width != video_width || img.height != video_height)
++        {
++            VC_IMAGE_T new_img = {
++                .type = VC_IMAGE_YUV10COL,
++                .width = video_width,
++                .height = video_height
++            };
++
++            mbox_get_image_params(mbox_fd(), &new_img);
++            img = new_img;
++        }
++
++        geo.stride_y = stripe_w;
++        geo.stride_c = stripe_w;
++        geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++        geo.height_c = img.pitch / stripe_w - geo.height_y;
++        geo.planes_c = 1;
++        geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
++        geo.bytes_per_pel = 1;
++        geo.stripe_is_yc = 1;
++
++        pthread_mutex_unlock(&sand_lock);
++
++#if 0
++        printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++               video_width, video_height,
++               geo.stride_y, geo.stride_c,
++               geo.height_y, geo.height_c,
++               geo.stripes, img.pitch);
++#endif
++        av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++        av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++        break;
++    }
++
++    case AV_PIX_FMT_SAND64_16:
++    case AV_PIX_FMT_SAND64_10:
++    {
++        const unsigned int stripe_w = 128;  // bytes
++
++        static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++        static VC_IMAGE_T img = {0};
++
++        // Given the overhead of calling the mailbox keep a stashed
++        // copy as we will almost certainly just want the same numbers again
++        // but that means we need a lock
++        pthread_mutex_lock(&sand_lock);
++
++        if (img.width != video_width || img.height != video_height)
++        {
++            VC_IMAGE_T new_img = {
++                .type = VC_IMAGE_YUV_UV_16,
++                .width = video_width,
++                .height = video_height
++            };
++
++            mbox_get_image_params(mbox_fd(), &new_img);
++            img = new_img;
++        }
++
++        geo.stride_y = stripe_w;
++        geo.stride_c = stripe_w;
++        geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++        geo.height_c = img.pitch / stripe_w - geo.height_y;
++        geo.planes_c = 1;
++        geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
++        geo.bytes_per_pel = 2;
++        geo.stripe_is_yc = 1;
++
++        pthread_mutex_unlock(&sand_lock);
++        break;
++    }
++
++    default:
++        break;
++    }
++    return geo;
++}
++
++//============================================================================
++//
++// ZC Env fns
++//
++// Frame copy fns
++
++// Copy a 3-plane, 1 byte per sample 4:2:0 frame into a newly allocated ZC
++// buffer. Returns the new buffer or NULL on failure
++static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    AVFrame dest_frame;
++    AVFrame * const dest = &dest_frame;
++    // Chroma planes round up for odd luma dimensions
++    const unsigned int c_width = (src->width + 1) / 2;
++    const unsigned int c_height = (src->height + 1) / 2;
++    unsigned int i;
++    uint8_t * psrc, * pdest;
++
++    dest->format = src->format;
++    dest->width = src->width;
++    dest->height = src->height;
++
++    // On failure get_buffer leaves dest->buf[0] NULL so there is nothing to free
++    if (av_rpi_zc_get_buffer(zc, dest) != 0)
++        return NULL;
++
++    if (av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
++    {
++        // get_buffer succeeded so drop its buffer (and the env ref it holds)
++        av_buffer_unref(&dest->buf[0]);
++        return NULL;
++    }
++
++    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
++         i != dest->height;
++         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
++    {
++        memcpy(pdest, psrc, dest->width);
++    }
++    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
++         i != c_height;
++         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
++    {
++        memcpy(pdest, psrc, c_width);
++    }
++    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
++         i != c_height;
++         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
++    {
++        memcpy(pdest, psrc, c_width);
++    }
++
++    return dest->buf[0];
++}
++
++
++// Not implemented - asserts
++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    assert(0);
++    return NULL;
++}
++
++
++// Not implemented - asserts
++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
++    const AVFrame * const src, const unsigned int src_bits)
++{
++    assert(0);
++    return NULL;
++}
++
++//----------------------------------------------------------------------------
++//
++// Public info extraction calls
++
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
++
++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
++{
++    // Kludge where we check the free fn to check this is really
++    // one of our buffers - can't think of a better way
++    return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL :
++        av_buffer_get_opaque(buf);
++}
++
++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
++{
++    // As gmem is the first el NULL should be preserved
++    return &pic_zbe_ptr(buf)->gmem;
++}
++
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : p->vcsm_handle;
++}
++
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? -1 : p->vc_handle;
++}
++
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? 0 : zbe->offset;
++}
++
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? 0 : zbe->size_pic;
++}
++
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : p->numbytes;
++}
++
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? NULL : &zbe->geo;
++}
++
++// Get a ZC reference for frame: a new ref to its own buffer if that is a
++// single ZC buffer of the expected format, otherwise a copy if maycopy is
++// set, otherwise NULL
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
++    const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
++{
++    av_assert0(!maycopy || zc != NULL);
++
++    if (frame->format != AV_PIX_FMT_YUV420P &&
++        frame->format != AV_PIX_FMT_YUV420P10 &&
++        !av_rpi_is_sand_frame(frame))
++    {
++        av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
++        return NULL;
++    }
++
++    if (frame->buf[1] != NULL || frame->format != expected_format)
++    {
++#if RPI_ZC_SAND_8_IN_10_BUF
++        if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
++        {
++//            av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
++            return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
++        }
++#endif
++
++        if (maycopy)
++        {
++            if (frame->buf[1] != NULL)
++                av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
++            else
++                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
++
++            switch (frame->format)
++            {
++                case AV_PIX_FMT_YUV420P10:
++                    return zc_420p10_to_sand128(zc, frame);
++
++                case AV_PIX_FMT_SAND64_10:
++                    return zc_sand64_16_to_sand128(zc, frame, 10);
++
++                default:
++                    return zc_copy(zc, frame);
++            }
++        }
++        else
++        {
++            if (frame->buf[1] != NULL)
++                av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
++            else
++                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
++            return NULL;
++        }
++    }
++
++    if (pic_gm_ptr(frame->buf[0]) == NULL)
++    {
++        if (maycopy)
++        {
++            av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
++            return zc_copy(zc, frame);
++        }
++        else
++        {
++            av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
++            return NULL;
++        }
++    }
++
++    return av_buffer_ref(frame->buf[0]);
++}
++
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
++{
++    if (fr_ref != NULL)
++    {
++        av_buffer_unref(&fr_ref);
++    }
++}
++
++//----------------------------------------------------------------------------
++
++// Extract user environment from an AVBufferRef
++void * av_rpi_zc_buf_v(AVBufferRef * const buf)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++    if (zbe != NULL && zbe->user != NULL)
++    {
++        const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data;
++        return zub == NULL ? NULL : zub->v;
++    }
++    return NULL;
++}
++
++// AV buffer pre-free callback
++static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
++{
++    if (opaque != NULL)
++    {
++        ZcUserBufEnv * const zub = opaque;
++
++        if (zub->fn->free)
++            zub->fn->free(zub->v);
++
++        av_free(zub);
++    }
++}
++
++// Free callback for the per-frame env: drops the user buffer & the env ref
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
++{
++    if (opaque != NULL)
++    {
++        ZcBufEnv * const zbe = opaque;
++
++        av_buffer_unref(&zbe->user);
++
++        if (zbe->zc != NULL)
++            av_rpi_zc_env_release(zbe->zc);
++
++        pthread_cond_destroy(&zbe->cond);
++        pthread_mutex_destroy(&zbe->lock);
++        av_free(zbe);
++    }
++}
++
++
++// Wrap the various ZC bits in an AV Buffer and resolve those things we want
++// resolved now.
++// Currently we resolve everything, but in future we might not
++// Returns NULL on allocation failure
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
++{
++    AVBufferRef *buf;
++    ZcUserBufEnv * zub;
++
++    if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
++        return NULL;
++
++    zub->fn = fn_tab;
++    zub->v = v;
++    zub->numbytes = numbytes;
++    zub->offset = addr_offset;
++
++    if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
++        av_free(zub);
++        return NULL;
++    }
++
++    return buf;
++}
++
++// Make buf usable according to alloc_mode:
++//   ZC_RESOLVE_FAIL         return AVERROR(EAGAIN) if not yet valid
++//   ZC_RESOLVE_WAIT_VALID   block until marked valid or broken
++//   ZC_RESOLVE_ALLOC(_VALID) allocate the backing memory if not yet done
++// Returns 0 on success, AVERROR(EINVAL) if buf is not a ZC buffer or has
++// been marked broken, AVERROR(ENOMEM) if allocation/mapping fails
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
++        return AVERROR(EAGAIN);
++
++    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
++    {
++        pthread_mutex_lock(&zbe->lock);
++        while (!zbe->is_valid)
++            pthread_cond_wait(&zbe->cond, &zbe->lock);
++        pthread_mutex_unlock(&zbe->lock);
++    }
++
++    if (zbe->is_valid == ZC_BUF_NEVER)
++        return AVERROR(EINVAL);
++
++    // Do alloc if we need it
++    if (zbe->user == NULL)
++    {
++        ZcEnv * const zc = zbe->zc;
++        const ZcUserBufEnv * zub;
++
++        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
++
++        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++            goto fail;
++        }
++        zub = (const ZcUserBufEnv *)zbe->user->data;
++
++        // Track
++
++        zbe->offset = zub->offset;
++        zbe->gmem.numbytes = zub->numbytes;
++        // NOTE(review): gmem.vcsm_handle is only fetched below, so the value
++        // logged on this failure is whatever the field held beforehand
++        if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++
++        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
++            goto fail;
++        }
++
++        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++
++        buf->buffer->data = zbe->gmem.arm + zbe->offset;
++        buf->buffer->size = zbe->size_pic;
++
++        // In this mode we shouldn't have anyone waiting for us
++        // so no need to signal
++        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
++            zbe->is_valid = ZC_BUF_VALID;
++    }
++
++    // Just overwrite - no point in testing
++    buf->data = zbe->gmem.arm + zbe->offset;
++    buf->size = zbe->size_pic;
++    return 0;
++
++fail:
++    av_buffer_unref(&zbe->user);
++    return AVERROR(ENOMEM);
++}
++
++// Resolve frame->buf[0] (see av_rpi_zc_resolve_buffer) and, if the frame
++// has no data pointers yet, fill in data[], linesize[] & extended_data
++// from the stored geometry
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
++{
++    int rv;
++
++    // Do alloc if we need it
++    if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
++        return rv;
++
++    // If we are a framebuf copy then the alloc can be done but we haven't
++    // imported its results yet
++    if (frame->data[0] == NULL)
++    {
++        const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++        frame->linesize[0] = zbe->geo.stride_y;
++        frame->linesize[1] = zbe->geo.stride_c;
++        frame->linesize[2] = zbe->geo.stride_c;
++        // abuse: linesize[3] = "stripe stride"
++        // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++        // In a general case this makes the calculation an xor and multiply rather
++        // than a divide and multiply
++        if (zbe->geo.stripes > 1)
++            frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
++
++        frame->data[0] = frame->buf[0]->data;
++        frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
++        if (zbe->geo.planes_c > 1)
++            frame->data[2] = frame->data[1] + zbe->size_c;
++
++        frame->extended_data = frame->data;
++        // Leave extended buf alone
++    }
++
++    return 0;
++}
++
++// Mark the frame's buffer as valid and wake anything waiting in
++// av_rpi_zc_resolve_buffer. Returns AVERROR(EINVAL) if not a ZC frame
++int av_rpi_zc_set_valid_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // The state change must be made under the lock the waiter holds while
++    // it tests is_valid, otherwise the broadcast can land between the
++    // waiter's test and its pthread_cond_wait and be lost
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_VALID;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++// Mark the frame's buffer as never going to be valid and wake anything
++// waiting on it. Returns AVERROR(EINVAL) if not a ZC frame
++int av_rpi_zc_set_broken_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // Locked for the same reason as in av_rpi_zc_set_valid_frame
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_NEVER;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
++{
++    zc->pool_size = pool_size;
++}
++
++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
++{
++    return zc->pool_size;
++}
++
++// Attach a new (unallocated) ZC buffer env to frame as buf[0]
++// All of the frame's buf/data/linesize entries are cleared first
++// The geometry is computed now; the memory is allocated on resolve
++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
++{
++#if 1
++    ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
++
++    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++        frame->buf[i] = NULL;
++        frame->data[i] = NULL;
++        frame->linesize[i] = 0;
++    }
++
++    if (zbe == NULL)
++        return AVERROR(ENOMEM);
++
++    if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
++    {
++        av_free(zbe);
++        return AVERROR(ENOMEM);
++    }
++
++    pthread_mutex_init(&zbe->lock, NULL);
++    pthread_cond_init(&zbe->cond, NULL);
++    zbe->zc = zc;
++    atomic_fetch_add(&zc->refcount, 1);
++
++    zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);  // Note geometry for later use
++    zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
++    zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
++    zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
++
++#else
++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
++    const unsigned int size_y = geo.stride_y * geo.height_y;
++    const unsigned int size_c = geo.stride_c * geo.height_c;
++    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
++    AVBufferRef * buf;
++    unsigned int i;
++
++//    printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
++
++    if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++        return AVERROR(ENOMEM);
++    }
++
++    // Track
++    atomic_fetch_add(&zc->refcount, 1);
++    pic_zbe_ptr(buf)->zc = zc;
++
++    for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++        frame->buf[i] = NULL;
++        frame->data[i] = NULL;
++        frame->linesize[i] = 0;
++    }
++
++    frame->buf[0] = buf;
++
++    frame->linesize[0] = geo.stride_y;
++    frame->linesize[1] = geo.stride_c;
++    frame->linesize[2] = geo.stride_c;
++    // abuse: linesize[3] = "stripe stride"
++    // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++    // In a general case this makes the calculation an xor and multiply rather
++    // than a divide and multiply
++    if (geo.stripes > 1)
++        frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
++
++    frame->data[0] = buf->data;
++    frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
++    if (geo.planes_c > 1)
++        frame->data[2] = frame->data[1] + size_c;
++
++    frame->extended_data = frame->data;
++    // Leave extended buf alone
++
++#if RPI_ZC_SAND_8_IN_10_BUF != 0
++    // *** If we intend to use this for real we will want a 2nd buffer pool
++    frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic);  // *** 2 * wanted size - kludge
++#endif
++#endif
++
++    return 0;
++}
++
++// Drop one reference to the env; the last reference frees the pool (via
++// the free_pool callback) and the env itself
++void av_rpi_zc_env_release(const AVZcEnvPtr zc)
++{
++    const int n = atomic_fetch_add(&zc->refcount, -1);
++    if (n == 1)  // was 1, now 0
++    {
++        zc->free_pool(zc->pool_env);
++        av_free(zc);
++    }
++}
++
++// Create an env with a refcount of 1 that allocates via alloc_buf_fn
++// Returns NULL (after logging) on allocation failure
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * zc;
++
++    if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
++    {
++        av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
++        return NULL;
++    }
++
++    *zc = (ZcEnv){
++        .refcount = ATOMIC_VAR_INIT(1),
++        .pool_env = pool_env,
++        .alloc_buf = alloc_buf_fn,
++        .free_pool = free_pool_fn,
++        .pool_size = 0
++    };
++
++    return zc;
++}
++
++//============================================================================
++//
++// External ZC initialisation
++
++#define RPI_GET_BUFFER2 1
++
++
++// get_buffer2 replacement: ZC allocation for YUV420P & sand frames when
++// the codec has AV_CODEC_CAP_DR1, the default allocator otherwise
++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
++{
++#if !RPI_GET_BUFFER2
++    return avcodec_default_get_buffer2(s, frame, flags);
++#else
++    int rv;
++
++    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
++    {
++//        printf("Do default alloc: format=%#x\n", frame->format);
++        rv = avcodec_default_get_buffer2(s, frame, flags);
++    }
++    else if (frame->format == AV_PIX_FMT_YUV420P ||
++             av_rpi_is_sand_frame(frame))
++    {
++        if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
++            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
++    }
++    else
++    {
++        rv = avcodec_default_get_buffer2(s, frame, flags);
++    }
++
++#if 0
++    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
++        frame->format, frame->width, frame->height,
++        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
++        frame->data[0], frame->data[1], frame->data[2],
++        frame->buf[0], frame->buf[1], frame->buf[2],
++        av_buffer_get_opaque(frame->buf[0]));
++#endif
++    return rv;
++#endif
++}
++
++// Non-zero if ZC has been installed on this codec context
++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{
++    return s->get_buffer2 == zc_get_buffer2;
++}
++
++// Install ZC on a codec context, saving the values it replaces
++// Returns 0 on success or AVERROR(ENOMEM)
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * zc;
++
++    av_assert0(!av_rpi_zc_in_use(s));
++
++    if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
++        return AVERROR(ENOMEM);
++
++    zc->old = (ZcOldCtxVals){
++        .opaque = s->opaque,
++        .get_buffer2 = s->get_buffer2,
++        .thread_safe_callbacks = s->thread_safe_callbacks
++    };
++
++    s->opaque = zc;
++    s->get_buffer2 = zc_get_buffer2;
++    s->thread_safe_callbacks = 1;
++    return 0;
++}
++
++// Undo av_rpi_zc_init2: restore the saved context values & drop our env ref
++void av_rpi_zc_uninit2(struct AVCodecContext * const s)
++{
++    ZcEnv * const zc = s->opaque;
++
++    av_assert0(av_rpi_zc_in_use(s));
++
++    s->get_buffer2 = zc->old.get_buffer2;
++    s->opaque = zc->old.opaque;
++    s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
++
++    av_rpi_zc_env_release(zc);
++}
++
+diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+new file mode 100644
+index 0000000000..f00a7c962c
+--- /dev/null
++++ b/libavcodec/rpi_zc.h
+@@ -0,0 +1,228 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#ifndef LIBAVCODEC_RPI_ZC_H ++#define LIBAVCODEC_RPI_ZC_H ++ ++// Zero-Copy frame code for RPi ++// RPi needs Y/U/V planes to be contiguous for display. By default ++// ffmpeg will allocate separated planes so a memcpy is needed before ++// display. This code provides a method of making ffmpeg allocate a single ++// block of memory for the frame which can then be reference counted until ++// display has finished with it.
++ ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; ++ ++// "Opaque" pointer to whatever we are using as a buffer reference ++typedef struct AVBufferRef * AVRpiZcRefPtr; ++ ++struct AVZcEnv; ++typedef struct AVZcEnv * AVZcEnvPtr; ++ ++typedef struct AVRpiZcFrameGeometry ++{ ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma height (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; ++ int stripe_is_yc; // A single stripe is Y then C (false for tall sand) ++ ++ int format; // Requested format ++ unsigned int video_width; // Requested width ++ unsigned int video_height; // Requested height ++} AVRpiZcFrameGeometry; ++ ++// Get expected MMAL geometry for a given format, width & height ++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, ++ const unsigned int video_width, const unsigned int video_height); ++ ++//---------------------------------------------------------------------------- ++// ++// Calls that extract info from a ZC frame whether internally or externally ++// allocated ++ ++// Generate a ZC reference to the buffer(s) in this frame ++// If the buffer doesn't appear to be one allocated by ZC ++// then the behaviour depends on maycopy: ++// If maycopy=0 then return NULL ++// If maycopy=1 && the src frame is in a form where we can easily copy ++// the data, then allocate a new buffer and copy the data into it ++// Otherwise return NULL ++// If maycopy == 0 then ZC may be NULL ++AVRpiZcRefPtr 
av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc, ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); ++ ++// Unreference the buffer refed/allocated by _zc_ref ++// If fr_ref is NULL then this will NOP ++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref); ++ ++// Get the vc_handle from the frame ref ++// Returns -1 if ref doesn't look valid ++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); ++// Get the vcsm_handle from the frame ref ++// Returns 0 if ref doesn't look valid ++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref); ++// Get offset from the start of the memory referenced ++// by the vc_handle to valid data ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); ++// Length of buffer data ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); ++// Get the number of bytes allocated from the frame ref ++// Returns 0 if ref doesn't look valid ++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); ++// Geometry this frame was allocated with ++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref); ++ ++//---------------------------------------------------------------------------- ++// ++// Calls for external frame allocation ++ ++// Callbacks registered in av_rpi_zc_init2 ++ ++// Callback to allocate a buf for a frame ++// The frame itself is generated in the calling code ++// ++// Parameters: ++// pool_env value passed to av_rpi_zc_init2 ++// size size wanted ++// geo geometry of the frame to be allocated ++// Returns: ++// NULL Alloc failed ++// ptr AVBufferRef* of allocated buffer ++// In most cases av_rpi_zc_buf will be called by this function ++// and this will be the buf returned by that. 
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size, ++ const AVRpiZcFrameGeometry * geo); ++ ++// Callback once ffmpeg is completely done with this pool ++// Called once all allocated buffers have been derefed and ffmpegs ref to this ++// pool has been dropped ++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env); ++ ++// Init ZC into a context ++// Sets opaque, get_buffer2, thread_safe_callbacks ++// Use if you want to allocate your own pools and/or create ZC buffers for ++// all decoders ++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken ++// apart by av_rpi_zc_xxx calls without this ++int av_rpi_zc_init2(struct AVCodecContext * const s, ++ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn); ++ ++// Free ZC from a context ++void av_rpi_zc_uninit2(struct AVCodecContext * const s); ++ ++// Get minimum pool size in frames - valid by the time the first alloc request ++// occurs. Takes into account thread requests and DPB sizes derived from SPS ++// rather than just adding a worst case DPB size. ++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc); ++ ++typedef struct av_rpi_zc_buf_fn_tab_s { ++ // This AVBuffer is being freed by ffmpeg - return memory ++ // to external pool. Memory may be, but need not be, unmapped. ++ // v is the ptr passed in av_rpi_zc_buf ++ void (* free)(void * v); ++ ++ // Return appropriate handles / mappings ++ // v is the ptr passed in av_rpi_zc_buf ++ unsigned int (* vcsm_handle)(void * v); ++ unsigned int (* vc_handle)(void * v); ++ void * (* map_arm)(void * v); ++ unsigned int (* map_vc)(void * v); ++} av_rpi_zc_buf_fn_tab_t; ++ ++// Allocate a ZC AVBufferRef and set its callback table ++// Doesn't take a buffer address directly - relies on callbacks to return ++// addresses as they are required. 
Mappings need not be generated until ++// the map callbacks are called but they should persist from then until ++// the buffer is freed. ++// ++// Parameters: ++// numbytes Size of the buffer ++// addr_offset Offset to first usable byte of buffer (for alignment) ++// normally 0 ++// v Pointer passed to callbacks ++// fn_tab Function table ++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab); ++ ++// Get v ptr set in av_rpi_zc_buf ++void * av_rpi_zc_buf_v(AVBufferRef * const buf); ++ ++//---------------------------------------------------------------------------- ++// ++// Mostly internal calls but might possibly be wanted by outside code ++ ++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc); ++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx); ++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size); ++ ++// Test to see if the context is using zc (checks get_buffer2) ++int av_rpi_zc_in_use(const struct AVCodecContext * const s); ++ ++// Get buffer generates placeholders for later alloc ++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame); ++// Resolve actually does the alloc (noop if already alloced) ++// Set data pointers on a buffer/frame that was copied before the alloc ++// occurred ++#define ZC_RESOLVE_FAIL 0 // return error on invalid ++#define ZC_RESOLVE_ALLOC 1 // alloc as invalid ++#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid ++#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid ++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc); ++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc); ++ ++int av_rpi_zc_set_valid_frame(AVFrame * const frame); ++int av_rpi_zc_set_broken_frame(AVFrame * const frame); ++ ++ ++ ++ ++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, ++ void * pool_env, ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn); ++void 
av_rpi_zc_env_release(const AVZcEnvPtr zc); ++ ++ ++#endif ++ +diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h +new file mode 100644 +index 0000000000..9b7b6536a4 +--- /dev/null ++++ b/libavcodec/rpi_zc_frames.h +@@ -0,0 +1,142 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_ZC_FRAMES_H ++#define RPI_ZC_FRAMES_H ++ ++#define RPI_ONE_BUF 1 ++ ++#include "rpi_mem.h" // for GPU_MEM_PTR_T ++#include "libavutil/frame.h" ++ ++#if !RPI_ONE_BUF ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++ return p->vc; ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++} ++ ++#else ++ ++static inline int gpu_is_buf1(const AVFrame * const frame) ++{ ++ return frame->buf[1] == NULL; ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) ++{ ++ return av_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) ++{ ++ return av_buffer_pool_buffer_get_opaque(frame->buf[n]); ++} ++ ++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) ++{ ++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); ++ return gm->vc + (frame->data[n] - gm->arm); ++} ++ ++ ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ return get_vc_address3(frame, 0); ++} ++ ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ return get_vc_address3(frame, 1); ++} ++ ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ return get_vc_address3(frame, 2); ++} ++ ++#if 0 ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.numbytes = frame->data[1] - frame->data[0]; ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 0); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[1] - frame->data[0]; ++ g.vc += frame->data[1] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 1); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[2] - frame->data[0]; ++ g.vc += frame->data[2] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 2); ++} ++#endif ++#endif ++ ++#endif +diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c +new file mode 100644 +index 0000000000..85c5b46d75 +--- /dev/null ++++ b/libavcodec/rpivid_hevc.c +@@ -0,0 +1,2128 @@ ++// FFMPEG HEVC decoder hardware accelerator ++// Andrew Holme, Argon Design Ltd ++// Copyright (c) June 2017 Raspberry Pi Ltd ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "fftools/ffmpeg.h" ++#include "libavutil/avassert.h" ++#include 
"libavutil/imgutils.h" ++#include "avcodec.h" ++#include "hwconfig.h" ++#include "decode.h" ++ ++#include "hevc.h" ++#include "hevcdec.h" ++#include "rpi_zc.h" ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++#include "rpi_mailbox.h" ++ ++ ++#define OPT_PHASE_TIMING 0 // Generate stats for phase usage ++ ++#define OPT_EMU 0 ++ ++#define TRACE_DEV 0 ++#define TRACE_ENTRY 0 ++ ++#define NUM_SCALING_FACTORS 4064 ++ ++#define AXI_BASE64 0 ++ ++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0)) ++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6)) ++ ++#define RPIVID_COL_PICS 17 // 16 ref & current ++ ++#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1) ++#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size ++ ++#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2) ++#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size ++ ++////////////////////////////////////////////////////////////////////////////// ++// ++// Register offsets ++ ++#define RPI_SPS0 0 ++#define RPI_SPS1 4 ++#define RPI_PPS 8 ++#define RPI_SLICE 12 ++#define RPI_TILESTART 16 ++#define RPI_TILEEND 20 ++#define RPI_SLICESTART 24 ++#define RPI_MODE 28 ++#define RPI_LEFT0 32 ++#define RPI_LEFT1 36 ++#define RPI_LEFT2 40 ++#define RPI_LEFT3 44 ++#define RPI_QP 48 ++#define RPI_CONTROL 52 ++#define RPI_STATUS 56 ++#define RPI_VERSION 60 ++#define RPI_BFBASE 64 ++#define RPI_BFNUM 68 ++#define RPI_BFCONTROL 72 ++#define RPI_BFSTATUS 76 ++#define RPI_PUWBASE 80 ++#define RPI_PUWSTRIDE 84 ++#define RPI_COEFFWBASE 88 ++#define RPI_COEFFWSTRIDE 92 ++#define RPI_SLICECMDS 96 ++#define RPI_BEGINTILEEND 100 ++#define RPI_TRANSFER 104 ++#define RPI_CFBASE 108 ++#define RPI_CFNUM 112 ++#define RPI_CFSTATUS 116 ++ ++#define RPI_PURBASE 0x8000 ++#define RPI_PURSTRIDE 0x8004 ++#define RPI_COEFFRBASE 0x8008 ++#define RPI_COEFFRSTRIDE 0x800C ++#define RPI_NUMROWS 0x8010 ++#define RPI_CONFIG2 0x8014 ++#define RPI_OUTYBASE 0x8018 ++#define RPI_OUTYSTRIDE 0x801C ++#define RPI_OUTCBASE 0x8020 
++#define RPI_OUTCSTRIDE 0x8024 ++#define RPI_STATUS2 0x8028 ++#define RPI_FRAMESIZE 0x802C ++#define RPI_MVBASE 0x8030 ++#define RPI_MVSTRIDE 0x8034 ++#define RPI_COLBASE 0x8038 ++#define RPI_COLSTRIDE 0x803C ++#define RPI_CURRPOC 0x8040 ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++// Unused but left here to illustrate the diffrences between FFmpegs prob ++// structure and the rpivid one ++ ++struct FFM_PROB { ++ uint8_t sao_merge_flag [ 1]; ++ uint8_t sao_type_idx [ 1]; ++ uint8_t split_coding_unit_flag [ 3]; ++ uint8_t cu_transquant_bypass_flag [ 1]; ++ uint8_t skip_flag [ 3]; ++ uint8_t cu_qp_delta [ 3]; ++ uint8_t pred_mode_flag [ 1]; ++ uint8_t part_mode [ 4]; ++ uint8_t prev_intra_luma_pred_flag [ 1]; ++ uint8_t intra_chroma_pred_mode [ 2]; ++ uint8_t merge_flag [ 1]; ++ uint8_t merge_idx [ 1]; ++ uint8_t inter_pred_idc [ 5]; ++ uint8_t ref_idx_l0 [ 2]; ++ uint8_t ref_idx_l1 [ 2]; ++ uint8_t abs_mvd_greater0_flag [ 2]; ++ uint8_t abs_mvd_greater1_flag [ 2]; ++ uint8_t mvp_lx_flag [ 1]; ++ uint8_t no_residual_data_flag [ 1]; ++ uint8_t split_transform_flag [ 3]; ++ uint8_t cbf_luma [ 2]; ++ uint8_t cbf_cb_cr [ 4]; ++ uint8_t transform_skip_flag/*[][]*/ [ 2]; ++ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2]; ++ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2]; ++ uint8_t last_significant_coeff_x_prefix [18]; ++ uint8_t last_significant_coeff_y_prefix [18]; ++ uint8_t significant_coeff_group_flag [ 4]; ++ uint8_t significant_coeff_flag [44]; ++ uint8_t coeff_abs_level_greater1_flag [24]; ++ uint8_t coeff_abs_level_greater2_flag [ 6]; ++ uint8_t log2_res_scale_abs [ 8]; ++ uint8_t res_scale_sign_flag [ 2]; ++ uint8_t cu_chroma_qp_offset_flag [ 1]; ++ uint8_t cu_chroma_qp_offset_idx [ 1]; ++} __attribute__((packed)); ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_PROB { ++ uint8_t SAO_MERGE_FLAG [ 1]; ++ uint8_t SAO_TYPE_IDX [ 1]; ++ uint8_t SPLIT_FLAG [ 3]; ++ uint8_t 
CU_SKIP_FLAG [ 3]; ++ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1]; ++ uint8_t PRED_MODE [ 1]; ++ uint8_t PART_SIZE [ 4]; ++ uint8_t INTRA_PRED_MODE [ 1]; ++ uint8_t CHROMA_PRED_MODE [ 1]; ++ uint8_t MERGE_FLAG_EXT [ 1]; ++ uint8_t MERGE_IDX_EXT [ 1]; ++ uint8_t INTER_DIR [ 5]; ++ uint8_t REF_PIC [ 2]; ++ uint8_t MVP_IDX [ 1]; ++ uint8_t MVD [ 2]; ++ uint8_t QT_ROOT_CBF [ 1]; ++ uint8_t TRANS_SUBDIV_FLAG [ 3]; ++ uint8_t QT_CBF [ 6]; ++ uint8_t DQP [ 2]; ++ uint8_t ONE_FLAG [24]; ++ uint8_t LASTX [18]; ++ uint8_t LASTY [18]; ++ uint8_t SIG_CG_FLAG [ 4]; ++ uint8_t ABS_FLAG [ 6]; ++ uint8_t TRANSFORMSKIP_FLAG [ 2]; ++ uint8_t SIG_FLAG [42]; ++ uint8_t SIG_FLAG_unused [ 2]; ++} __attribute__((packed)); ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_CMD { ++ uint32_t addr; ++ uint32_t data; ++} __attribute__((packed)); ++ ++struct RPI_BIT { ++ int cmd; ++ const void *ptr; ++ int len; ++}; ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_T; ++ ++// Actual addressability is 38bits but we can only alloc in the bottom 32 ++// currently - when passed to rpivid h/w the address is always >> 6 so will ++// fit in 32 bit there ++// At some point we may weant to make this uint64_t ++typedef uint32_t vid_vc_addr_t; ++ ++typedef enum rpivid_decode_state_e { ++ RPIVID_DECODE_NEW = 0, ++ RPIVID_DECODE_START, ++ RPIVID_DECODE_SLICE, ++ RPIVID_DECODE_END, ++} rpivid_decode_state_t; ++ ++#define RPI_PROB_VALS 154U ++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3) ++ ++typedef struct dec_env_s { ++ const AVCodecContext * avctx; ++ ++ rpivid_decode_state_t state; ++ unsigned int decode_order; ++ ++ int phase_no; // Current phase (i.e. 
the last one we waited for) ++ struct dec_env_s * phase_wait_q_next; ++ sem_t phase_wait; ++ ++ struct RPI_BIT *bit_fifo; ++ struct RPI_CMD *cmd_fifo; ++ unsigned int bit_len, bit_max; ++ unsigned int cmd_len, cmd_max; ++ unsigned int num_slice_msgs; ++ unsigned int PicWidthInCtbsY; ++ unsigned int PicHeightInCtbsY; ++ unsigned int dpbno_col; ++ uint32_t reg_slicestart; ++ unsigned int wpp_entry_x; ++ unsigned int wpp_entry_y; ++ ++ const uint8_t * nal_buffer; ++ size_t nal_size; ++ ++ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3]; ++ uint8_t scaling_factors[NUM_SCALING_FACTORS]; ++// unsigned int RefPicList[2][HEVC_MAX_REFS]; ++} dec_env_t; ++ ++#define RPIVID_PHASES 3 ++#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order ++#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order ++ ++#if OPT_PHASE_TIMING ++static const unsigned int time_thresholds[8] = { ++ 10, 15, 20, 30, 45, 60, 75, 90 ++}; ++#endif ++ ++typedef struct phase_wait_env_s { ++ unsigned int last_order; ++ dec_env_t * q; ++#if OPT_PHASE_TIMING ++ uint64_t phase_time; ++ uint64_t max_phase_time; ++ uint64_t time_in_phase; ++ uint64_t time_out_phase; ++ unsigned int max_time_decode_order; ++ unsigned int time_bins[9]; ++ unsigned int time_bins3[9]; ++ unsigned int time_bins5[9]; ++ uint64_t time_stash[16]; ++ unsigned int i3; ++#endif ++} phase_wait_env_t; // Single linked list of threads waiting for this phase ++ ++typedef struct RPI_T { ++ atomic_int ref_count; ++ sem_t ref_zero; ++ ++ dec_env_t ** dec_envs; ++ AVZcEnvPtr zc; ++ ++ pthread_mutex_t phase_lock; ++ phase_wait_env_t phase_reqs[RPIVID_PHASES]; ++ ++ volatile uint32_t * regs; ++ volatile uint32_t * ints; ++ ++ GPU_MEM_PTR_T gcolbuf; ++ unsigned int col_stride; ++ size_t col_picsize; ++ ++ unsigned int bitbuf_no; ++ sem_t bitbuf_sem; ++ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS]; ++ ++ unsigned int max_pu_msgs; ++ unsigned int coeffbuf_no; ++ sem_t coeffbuf_sem; ++ GPU_MEM_PTR_T 
gcoeffbufs[RPIVID_COEFFBUFS]; ++ ++ unsigned int decode_order; ++ int mbox_fd; ++ int gpu_init_type; ++} RPI_T; ++ ++#if OPT_PHASE_TIMING ++static uint64_t tus64(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; ++} ++#endif ++ ++static inline unsigned int rnd64(unsigned int x) ++{ ++ return (x + 63) & ~63; ++} ++ ++static inline int rpi_sem_wait(sem_t * const sem) ++{ ++ int rv; ++ while ((rv = sem_wait(sem)) != 0 && errno == EINTR) ++ /* Loop */; ++ return rv; ++} ++ ++//============================================================================ ++ ++#define REGS_NAME "/dev/rpivid-hevcmem" ++#define REGS_SIZE 0x10000 ++#define INTS_NAME "/dev/rpivid-intcmem" ++#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway ++ ++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size) ++{ ++ void *gpio_map; ++ int mem_fd; ++ ++ /* open /dev/mem */ ++ if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) { ++ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name); ++ return NULL; ++ } ++ ++ // Now map it ++ gpio_map = mmap( ++ NULL, ++ size, ++ PROT_READ|PROT_WRITE, ++ MAP_SHARED, ++ mem_fd, ++ 0 ++ ); ++ ++ close(mem_fd); // No longer need the FD ++ ++ if (gpio_map == MAP_FAILED) { ++ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed"); ++ return NULL; ++ } ++ ++ return (volatile uint32_t *)gpio_map; ++} ++ ++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size) ++{ ++ volatile uint32_t * const gpio_map = *p_gpio_map; ++ if (gpio_map != NULL) { ++ *p_gpio_map = NULL; ++ munmap((void *)gpio_map, size); ++ } ++} ++ ++#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing! 
++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6) ++ ++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data) ++{ ++#if TRACE_DEV ++ printf("W %x %08x\n", addr, MANGLE64(data)); ++#endif ++ ++ rpi->regs[addr >> 2] = MANGLE64(data); ++} ++ ++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data) ++{ ++#if TRACE_DEV ++ printf("W %x %08x\n", addr, data >> 6); ++#endif ++ ++ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed ++} ++ ++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data) ++{ ++#if TRACE_DEV ++ printf("W %x %08x\n", addr, data); ++#endif ++ ++ rpi->regs[addr >> 2] = data; ++} ++ ++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr) ++{ ++ const uint32_t v = rpi->regs[addr >> 2]; ++#if TRACE_DEV ++ printf("R %x (=%x)\n", addr, v); ++#endif ++ return v; ++} ++ ++#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001 ++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002 ++#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004 ++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008 ++#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010 ++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020 ++#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040 ++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080 ++ ++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) ++{ ++ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; ++ const uint32_t mask_done = phase == 1 ? 
ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET; ++ uint32_t ival; ++ while (((ival = rpi->ints[0]) & mask_done) == 0) { ++ usleep(1000); ++ } ++ rpi->ints[0] = ival & mask_reset; ++} ++ ++#if TRACE_DEV && 0 ++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) { ++ int i; ++ ++ for (i=0; iregs[(addr>>2)+i]); ++ ++ if ((i%4)==3 || i+1 == num) ++ printf("\n"); ++ else ++ printf(" "); ++ } ++} ++ ++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) { ++ int i; ++ ++ for (i=0; i>2; i++) ++ { ++ if ((i%4)==0) ++ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i); ++ ++ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]); ++ ++ if ((i%4)==3 || i+1 == size>>2) ++ printf("\n"); ++ else ++ printf(" "); ++ } ++} ++#endif ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static inline size_t round_up_size(const size_t x) ++{ ++ /* Admit no size < 256 */ ++ const unsigned int n = x < 256 ? 8 : av_log2(x) - 1; ++ ++ return x >= (3 << n) ? 
4 << n : (3 << n); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Scaling factors ++ ++static void expand_scaling_list( ++ const unsigned int sizeID, ++ const unsigned int matrixID, ++ uint8_t * const dst0, ++ const uint8_t * const src0, ++ uint8_t dc) ++{ ++ switch (sizeID) { ++ case 0: ++ memcpy(dst0, src0, 16); ++ break; ++ case 1: ++ memcpy(dst0, src0, 64); ++ break; ++ case 2: ++ { ++ uint8_t * d = dst0; ++ for (unsigned int y=0; y != 16; y++) { ++ const uint8_t * s = src0 + (y >> 1) * 8; ++ for (unsigned int x = 0; x != 8; ++x) { ++ *d++ = *s; ++ *d++ = *s++; ++ } ++ } ++ dst0[0] = dc; ++ break; ++ } ++ default: ++ { ++ uint8_t * d = dst0; ++ for (unsigned int y=0; y != 32; y++) { ++ const uint8_t * s = src0 + (y >> 2) * 8; ++ for (unsigned int x = 0; x != 8; ++x) { ++ *d++ = *s; ++ *d++ = *s; ++ *d++ = *s; ++ *d++ = *s++; ++ } ++ } ++ dst0[0] = dc; ++ break; ++ } ++ } ++} ++ ++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) { ++ // Array of constants for scaling factors ++ static const uint32_t scaling_factor_offsets[4][6] = { ++ // MID0 MID1 MID2 MID3 MID4 MID5 ++ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4) ++ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8) ++ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16) ++ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32) ++ ++ // ffmpeg places SID3,MID1 where matrixID 3 normally is ++ const ScalingList * const sl = ++ s->ps.pps->scaling_list_data_present_flag ? 
&s->ps.pps->scaling_list ++ : &s->ps.sps->scaling_list; ++ unsigned int mid; ++ ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(0, mid, ++ de->scaling_factors + scaling_factor_offsets[0][mid], ++ sl->sl[0][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(1, mid, ++ de->scaling_factors + scaling_factor_offsets[1][mid], ++ sl->sl[1][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(2, mid, ++ de->scaling_factors + scaling_factor_offsets[2][mid], ++ sl->sl[2][mid], ++ sl->sl_dc[0][mid]); ++ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg ++ for (mid=0; mid<6; mid += 3) ++ expand_scaling_list(3, mid, ++ de->scaling_factors + scaling_factor_offsets[3][mid], ++ sl->sl[3][mid], ++ sl->sl_dc[1][mid]); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Probabilities ++ ++static const uint8_t prob_init[3][156] = { ++ { ++ 153, 200, 139, 141, 157, 154, 154, 154, ++ 154, 154, 184, 154, 154, 154, 184, 63, ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ 154, 154, 154, 154, 154, 153, 138, 138, ++ 111, 141, 94, 138, 182, 154, 154, 154, ++ 140, 92, 137, 138, 140, 152, 138, 139, ++ 153, 74, 149, 92, 139, 107, 122, 152, ++ 140, 179, 166, 182, 140, 227, 122, 197, ++ 110, 110, 124, 125, 140, 153, 125, 127, ++ 140, 109, 111, 143, 127, 111, 79, 108, ++ 123, 63, 110, 110, 124, 125, 140, 153, ++ 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, 91, 171, 134, 141, ++ 138, 153, 136, 167, 152, 152, 139, 139, ++ 111, 111, 125, 110, 110, 94, 124, 108, ++ 124, 107, 125, 141, 179, 153, 125, 107, ++ 125, 141, 179, 153, 125, 107, 125, 141, ++ 179, 153, 125, 140, 139, 182, 182, 152, ++ 136, 152, 136, 153, 136, 139, 111, 136, ++ 139, 111, 0, 0, }, ++ { ++ 153, 185, 107, 139, 126, 197, 185, 201, ++ 154, 149, 154, 139, 154, 154, 154, 152, ++ 110, 122, 95, 79, 63, 31, 31, 153, ++ 153, 168, 140, 198, 79, 124, 138, 94, ++ 153, 111, 149, 107, 167, 154, 154, 154, ++ 154, 196, 196, 167, 154, 152, 167, 182, ++ 
182, 134, 149, 136, 153, 121, 136, 137, ++ 169, 194, 166, 167, 154, 167, 137, 182, ++ 125, 110, 94, 110, 95, 79, 125, 111, ++ 110, 78, 110, 111, 111, 95, 94, 108, ++ 123, 108, 125, 110, 94, 110, 95, 79, ++ 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, 121, 140, 61, 154, ++ 107, 167, 91, 122, 107, 167, 139, 139, ++ 155, 154, 139, 153, 139, 123, 123, 63, ++ 153, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 123, 123, 107, ++ 121, 107, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++ { ++ 153, 160, 107, 139, 126, 197, 185, 201, ++ 154, 134, 154, 139, 154, 154, 183, 152, ++ 154, 137, 95, 79, 63, 31, 31, 153, ++ 153, 168, 169, 198, 79, 224, 167, 122, ++ 153, 111, 149, 92, 167, 154, 154, 154, ++ 154, 196, 167, 167, 154, 152, 167, 182, ++ 182, 134, 149, 136, 153, 121, 136, 122, ++ 169, 208, 166, 167, 154, 152, 167, 182, ++ 125, 110, 124, 110, 95, 94, 125, 111, ++ 111, 79, 125, 126, 111, 111, 79, 108, ++ 123, 93, 125, 110, 124, 110, 95, 94, ++ 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, 121, 140, 61, 154, ++ 107, 167, 91, 107, 107, 167, 139, 139, ++ 170, 154, 139, 153, 139, 123, 123, 63, ++ 124, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 138, 138, 122, ++ 121, 122, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++// Phase 1 command and bit FIFOs ++ ++// ???? 
uint16_t addr - put in uint32_t ++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) { ++ if (de->cmd_len==de->cmd_max) ++ av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD))); ++ ++#if TRACE_DEV ++ printf("[%02x] %x %x\n", de->cmd_len, addr, data); ++#endif ++ ++ de->cmd_fifo[de->cmd_len].addr = addr; ++ de->cmd_fifo[de->cmd_len].data = data; ++ return de->cmd_len++; ++} ++ ++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) { ++ if (de->bit_len==de->bit_max) ++ av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT))); ++ de->bit_fifo[de->bit_len].cmd = cmd_idx; ++ de->bit_fifo[de->bit_len].ptr = ptr; ++ de->bit_fifo[de->bit_len].len = len; ++ de->bit_len++; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Write probability and scaling factor memories ++ ++#if 0 ++static void WriteProb(dec_env_t * const de) { ++ int i; ++ const uint8_t *p = (uint8_t *) &de->probabilities; ++ for (i=0; ish.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ? 
++ s->sh.slice_type + 1 : 2 - s->sh.slice_type; ++ const uint8_t * p = prob_init[init_type]; ++ const int q = av_clip(s->sh.slice_qp, 0, 51); ++ unsigned int i; ++ ++ for (i = 0; i < RPI_PROB_VALS; i++) { ++ int init_value = p[i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * q) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); ++ dst[i] = pre; ++ } ++ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) { ++ dst[i] = 0; ++ } ++ ++ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4) ++ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24)); ++ ++} ++ ++ ++static void WriteScalingFactors(dec_env_t * const de) { ++ int i; ++ const uint8_t *p = (uint8_t *) de->scaling_factors; ++ for (i=0; i= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c ++ return i-1; ++} ++ ++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) { ++ if (ctb < bd[num-1]) return ctb_size; ++ else if (width % ctb_size) return width % ctb_size; ++ else return ctb_size; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Handle PU and COEFF stream overflow ++ ++ ++// Returns: ++// -2 Other error ++// -1 Out of coeff space ++// 0 OK ++// 1 Out of PU space ++ ++static int check_status(const RPI_T * const rpi, dec_env_t * const de) { ++ uint32_t status; ++ ++ // this is the definition of successful completion of phase 1 ++ // it assures that status register is zero and all blocks in each tile have completed ++ if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM)) ++ return 0; ++ ++ status = apb_read(rpi, RPI_STATUS); ++ ++ if ((status & 8) != 0) ++ return -1; ++ ++ if ((status & 0x10) != 0) ++ return 1; ++ ++ return -2; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Write STATUS register with expected end CTU address of previous slice ++ 
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) { ++ const HEVCPPS * const pps = s->ps.pps; ++ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; ++ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); ++} ++ ++static void wpp_pause(dec_env_t * const de, int ctb_row) { ++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25); ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000); ++ p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2); ++} ++ ++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { ++ const HEVCPPS *pps = s->ps.pps; ++ int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; ++ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; ++ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; ++ if (de->wpp_entry_x<2 && (de->wpp_entry_y2) && de->PicWidthInCtbsY>2) ++ wpp_pause(de, last_y); ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); ++ if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_yps.sps; ++ const HEVCPPS *pps = s->ps.pps; ++ ++ p1_apb_write(de, RPI_SPS0, ++ (sps->log2_min_cb_size << 0) + ++ (sps->log2_ctb_size << 4) + ++ (sps->log2_min_tb_size << 8) + ++ (sps->log2_max_trafo_size << 12) + ++ (sps->bit_depth << 16) + ++ (sps->bit_depth << 20) + ++ (sps->max_transform_hierarchy_depth_intra << 24) + ++ (sps->max_transform_hierarchy_depth_inter << 28)); ++ ++ p1_apb_write(de, RPI_SPS1, ++ (sps->pcm.bit_depth << 0) + ++ (sps->pcm.bit_depth_chroma << 4) + ++ (sps->pcm.log2_min_pcm_cb_size << 8) + ++ (sps->pcm.log2_max_pcm_cb_size << 12) + ++ (sps->separate_colour_plane_flag? 
0:sps->chroma_format_idc << 16) + ++ (sps->amp_enabled_flag << 18) + ++ (sps->pcm_enabled_flag << 19) + ++ (sps->scaling_list_enable_flag << 20) + ++ (sps->sps_strong_intra_smoothing_enable_flag << 21)); ++ ++ p1_apb_write(de, RPI_PPS, ++ (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) + ++ (pps->cu_qp_delta_enabled_flag << 4) + ++ (pps->transquant_bypass_enable_flag << 5) + ++ (pps->transform_skip_enabled_flag << 6) + ++ (pps->sign_data_hiding_flag << 7) + ++ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) + ++ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) + ++ (pps->constrained_intra_pred_flag << 24)); ++ ++ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de); ++ ++ if (!s->sh.dependent_slice_segment_flag) { ++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; ++ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16); ++ } ++ ++ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void write_slice(dec_env_t * const de, const HEVCContext * const s, ++ const unsigned int slice_w, const unsigned int slice_h) { ++ uint32_t u32 = ++ (s->sh.slice_type << 12) ++ + (s->sh.slice_sample_adaptive_offset_flag[0] << 14) ++ + (s->sh.slice_sample_adaptive_offset_flag[1] << 15) ++ + (slice_w << 17) ++ + (slice_h << 24); ++ ++ if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |= ++ (s->sh.max_num_merge_cand << 0) ++ + (s->sh.nb_refs[L0] << 4) ++ + (s->sh.nb_refs[L1] << 8); ++ ++ if (s->sh.slice_type==HEVC_SLICE_B) ++ u32 |= s->sh.mvd_l1_zero_flag<<16; ++ p1_apb_write(de, RPI_SLICE, u32); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Wavefront mode ++ ++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s, ++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { 
++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int ctb_size = 1<log2_ctb_size; ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ ++ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY; ++ ++ int endx = de->PicWidthInCtbsY-1; ++ int endy = ctb_row; ++ ++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); ++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); ++ ++ p1_apb_write(de, RPI_TILESTART, 0); ++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); ++ ++ if (do_bte) ++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); ++ ++ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size); ++ ++ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); ++ ++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 
0x60001 : 0x20001); ++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Tiles mode ++ ++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s, ++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { ++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY; ++ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY; ++ ++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); ++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); ++ ++ int endx = pps->col_bd[tile_x+1] - 1; ++ int endy = pps->row_bd[tile_y+1] - 1; ++ ++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); ++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); ++ ++ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16)); ++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); ++ ++ if (do_bte) ++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); ++ ++ write_slice(de, s, slice_w, slice_h); ++ ++ if (resetQPY) ++ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); ++ ++ p1_apb_write(de, RPI_MODE, (0xFFFF << 0) ++ + (0x0 << 16) ++ + ((tile_x==pps->num_tile_columns-1) << 17) ++ + ((tile_y==pps->num_tile_rows-1) << 18)); ++ ++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++// Doesn't attempt to remove from context as we should only do this at the end ++// of time or on create error ++static void ++dec_env_delete(dec_env_t * const de) ++{ ++// gpu_free(&de->gbuf); ++ ++ av_freep(&de->cmd_fifo); ++ av_freep(&de->bit_fifo); ++ ++ sem_destroy(&de->phase_wait); ++ 
av_free(de); ++} ++ ++static dec_env_t * ++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi) ++{ ++ dec_env_t * const de = av_mallocz(sizeof(*de)); ++ int i; ++ ++ if (de == NULL) ++ return NULL; ++ ++ de->avctx = avctx; ++ de->phase_no = RPIVID_PHASE_NEW; ++ ++ sem_init(&de->phase_wait, 0, 0); ++ ++ if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL) ++ goto fail; ++ ++ if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL) ++ goto fail; ++ ++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this ++ for (i = 0; i != avctx->thread_count; ++i) { ++ if (rpi->dec_envs[i] == NULL) ++ { ++ rpi->dec_envs[i] = de; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (i == avctx->thread_count) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n"); ++ goto fail; ++ } ++ ++ return de; ++ ++fail: ++ dec_env_delete(de); ++ return NULL; ++} ++ ++ ++static dec_env_t * ++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi) ++{ ++ dec_env_t * de = NULL; ++ const int ref_count = atomic_fetch_add(&rpi->ref_count, 1); ++ ++ if (ref_count <= 0) { ++ // Already dead ++ av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");; ++ return NULL; ++ } ++ ++ for (int i = 0; i != avctx->thread_count; ++i) { ++ if (rpi->dec_envs[i] == NULL) ++ { ++ de = dec_env_new(avctx, rpi); ++ break; ++ } ++ if (rpi->dec_envs[i]->avctx == avctx) ++ { ++ de = rpi->dec_envs[i]; ++ break; ++ } ++ } ++ return de; ++} ++ ++// Call at end of fn ++// Used to ensure we aren't in a worker thead when killed ++static void ++dec_env_release(RPI_T * const rpi, dec_env_t * const de) ++{ ++ const int n = atomic_fetch_sub(&rpi->ref_count, 1); ++ if (n == 1) { ++ sem_post(&rpi->ref_zero); ++ } ++} ++ ++//---------------------------------------------------------------------------- ++ ++// Wait for a slot in the given phase ++// Any error return is probably fatal 
++static int ++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) ++{ ++ int needs_wait = 0; ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ ++ pthread_mutex_lock(&rpi->phase_lock); ++ if (p->last_order + 1 != de->decode_order) { ++ de->phase_wait_q_next = p->q; ++ p->q = de; ++ needs_wait = 1; ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (needs_wait) { ++ while (sem_wait(&de->phase_wait) == -1) ++ { ++ int err; ++ if ((err = errno) != EINTR) ++ return AVERROR(err); ++ } ++ } ++ ++ de->phase_no = phase_no; ++ return 0; ++} ++ ++static void ++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) ++{ ++ dec_env_t * next_de = NULL; ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ dec_env_t ** q = &p->q; ++ ++ pthread_mutex_lock(&rpi->phase_lock); ++ ++ p->last_order = de->decode_order; ++ while (*q != NULL) { ++ dec_env_t * const t_de = *q; ++ ++ if (t_de->decode_order == p->last_order + 1) { ++ // This is us - remove from Q ++ *q = t_de->phase_wait_q_next; ++ t_de->phase_wait_q_next = NULL; // Tidy ++ next_de = t_de; ++ break; ++ } ++ q = &t_de->phase_wait_q_next; ++ } ++ ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (next_de != NULL) ++ sem_post(&next_de->phase_wait); ++} ++ ++// Wait & signal stuff s.t. 
threads in other phases can continue ++static void ++abort_phases(RPI_T * const rpi, dec_env_t * const de) ++{ ++ for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) { ++ wait_phase(rpi, de, i); ++ post_phase(rpi, de, i); ++ } ++ de->phase_no = RPIVID_PHASE_NEW; ++} ++ ++// Start timing for phase ++// Stats only - no actual effect ++static inline void tstart_phase(RPI_T * const rpi, const int phase_no) ++{ ++#if OPT_PHASE_TIMING ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ const int64_t now = tus64(); ++ if (p->phase_time != 0) ++ p->time_out_phase += now - p->phase_time; ++ p->phase_time = now; ++#endif ++} ++ ++#if OPT_PHASE_TIMING ++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n) ++{ ++ uint64_t tsum = 0; ++ unsigned int i; ++ for (i = 0; i != avg_n; ++i) ++ tsum += p->time_stash[(p->i3 - i) & 15]; ++ for (i = 0; i != 9; ++i) { ++ if (time_thresholds[i] * 1000 * avg_n > tsum) ++ break; ++ } ++ return i; ++} ++#endif ++ ++// End timing for phase ++// Stats only - no actual effect ++static inline void tend_phase(RPI_T * const rpi, const int phase_no) ++{ ++#if OPT_PHASE_TIMING ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ const uint64_t now = tus64(); ++ const uint64_t in_time = now - p->phase_time; ++ ++ p->time_in_phase += in_time; ++ p->phase_time = now; ++ p->time_stash[p->i3] = in_time; ++ if (in_time > p->max_phase_time) { ++ p->max_phase_time = in_time; ++ p->max_time_decode_order = p->last_order; ++ } ++ ++p->time_bins[tavg_bin_phase(p, 1)]; ++ ++p->time_bins3[tavg_bin_phase(p, 3)]; ++ ++p->time_bins5[tavg_bin_phase(p, 5)]; ++ ++ p->i3 = (p->i3 + 1) & 15; ++#endif ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Start frame ++ ++static int rpi_hevc_start_frame( ++ AVCodecContext * avctx, ++ const uint8_t *buffer, ++ uint32_t size) { ++ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ dec_env_t * const de = 
dec_env_get(avctx, rpi); ++ const HEVCContext * const s = avctx->priv_data; ++ const HEVCSPS * const sps = s->ps.sps; ++ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return -1; ++ } ++ ++ de->phase_no = RPIVID_PHASE_START; ++ de->decode_order = ++rpi->decode_order; // *** atomic? ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); ++ return -1; ++ } ++ de->state = RPIVID_DECODE_START; ++ ++ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15 ++ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17 ++ de->bit_len = 0; ++ de->cmd_len = 0; ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p]\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Slice messages ++ ++static void msg_slice(dec_env_t * const de, const uint16_t msg) { ++ de->slice_msgs[de->num_slice_msgs++] = msg; ++} ++ ++static void program_slicecmds(dec_env_t * const de, const int sliceid) { ++ int i; ++ p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8)); ++ for(i=0; i < de->num_slice_msgs; i++) { ++ p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff); ++ } ++} ++ ++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { ++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ const SliceHeader *sh = &s->sh; ++ ++ int weightedPredFlag, i, rIdx; ++ uint16_t cmd_slice; ++ unsigned int collocated_from_l0_flag; ++ ++ de->num_slice_msgs=0; ++ de->dpbno_col = 0; ++ cmd_slice = 0; ++ if 
(sh->slice_type==HEVC_SLICE_I) cmd_slice = 1; ++ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2; ++ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3; ++ ++ if (sh->slice_type!=HEVC_SLICE_I) { ++ cmd_slice += sh->nb_refs[L0]<<2; ++ cmd_slice += sh->nb_refs[L1]<<6; ++ } ++ ++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) ++ cmd_slice |= sh->max_num_merge_cand<<11; ++ ++ collocated_from_l0_flag = ++ !sh->slice_temporal_mvp_enabled_flag ? ++ 0 : ++ sh->slice_type == HEVC_SLICE_B ? ++ (sh->collocated_list == L0) : ++ (sh->slice_type==HEVC_SLICE_P); ++ cmd_slice |= collocated_from_l0_flag<<14; ++ ++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) { ++ ++ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past ++ for(i=L0; i<=L1; i++) { ++ for(rIdx=0; rIdx nb_refs[i]; rIdx++) { ++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; ++ HEVCFrame *c = s->ref; // CurrentPicture ++ if (c->poc < f->poc) NoBackwardPredFlag = 0; ++ } ++ } ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ { ++ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ? ++ s->ref->refPicList + 0 : ++ s->ref->refPicList + 1; ++ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; ++ } ++ ++ cmd_slice += NoBackwardPredFlag<<10; ++ msg_slice(de, cmd_slice); ++ ++ // Write reference picture descriptions ++ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? 
pps->weighted_pred_flag : pps->weighted_bipred_flag; ++ ++ for(i=L0; i<=L1; i++) ++ for(rIdx=0; rIdx nb_refs[i]; rIdx++) { ++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; ++ HEVCFrame *c = s->ref; // CurrentPicture ++ int pic = f - s->DPB; ++ // Make sure pictures are in range 0 to 15 ++ int adjusted_pic = fref->refPicList[i].isLongTerm[rIdx]; ++ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); ++ msg_slice(de, f->poc); ++ if (weightedPredFlag) { ++ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3)); ++ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff); ++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3)); ++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff); ++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3)); ++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff); ++ } ++ } ++ } ++ else ++ msg_slice(de, cmd_slice); ++ ++ msg_slice(de, ((sh->beta_offset/2)&15) ++ + (((sh->tc_offset/2)&15) << 4) ++ + (sh->disable_deblocking_filter_flag << 8) ++ + (sh->slice_loop_filter_across_slices_enabled_flag << 9) ++ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK ++ ++ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF ++} ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return; ++ } ++ ++ switch (de->state) 
{ ++ case RPIVID_DECODE_NEW: ++ case RPIVID_DECODE_END: ++ // Expected transition ++ break; ++ ++ case RPIVID_DECODE_SLICE: ++ // Error transition ++ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n"); ++ break; ++ ++ case RPIVID_DECODE_START: ++ default: ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); ++ break; ++ } ++ ++ abort_phases(rpi, de); ++ de->state = RPIVID_DECODE_NEW; ++ ++ dec_env_release(rpi, de); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// End frame ++ ++static int rpi_hevc_end_frame(AVCodecContext * const avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ const HEVCContext * const s = avctx->priv_data; ++ const HEVCPPS * const pps = s->ps.pps; ++ const HEVCSPS * const sps = s->ps.sps; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ AVFrame * const f = s->ref->frame; ++ const unsigned int dpbno_cur = s->ref - s->DPB; ++ vid_vc_addr_t cmds_vc; ++ vid_vc_addr_t pu_base_vc; ++ unsigned int pu_stride; ++ vid_vc_addr_t coeff_base_vc; ++ unsigned int coeff_stride; ++ unsigned int i; ++ int rv = 0; ++ int status = 0; ++ int coeffbuf_sem_claimed = 0; ++ ++#if TRACE_ENTRY ++ fprintf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return AVERROR_BUG; // Should never happen ++ } ++ ++ if (de->state != RPIVID_DECODE_SLICE) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); ++ rv = AVERROR_UNKNOWN; ++ goto fail; ++ } ++ de->state = RPIVID_DECODE_END; ++ ++ // End of command compilation ++ { ++ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; ++ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; ++ if (pps->entropy_coding_sync_enabled_flag) { ++ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) ++ wpp_pause(de, last_y); ++ } ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + 
(last_y<<18)); ++ } ++ ++ // Phase 0 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 0); ++ rpi_sem_wait(&rpi->bitbuf_sem); ++ tstart_phase(rpi, 0); ++ ++ // Copy cmds & bits into gpu side buffer ++ // Layout: CMDS, BITS ++ { ++ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; ++ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; ++ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); ++ ++ uint8_t * p = armbase + rnd64(cmd_bytes); ++ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; ++ ++ cmds_vc = vcbase; ++ ++ // Copy all the bits & update bitstream cmds to point at the right bits ++ for (i = 0; i < de->bit_len; ++i) ++ { ++ const unsigned int seg_len = de->bit_fifo[i].len; ++ ++ if (p + seg_len > eobits) { ++ status = -1; ++ break; ++ } ++ ++ memcpy(p, de->bit_fifo[i].ptr, seg_len); ++ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); ++ ++ p += rnd64(seg_len); ++ } ++ ++ memcpy(armbase, de->cmd_fifo, cmd_bytes); ++ } ++ ++ if (status == 0) ++ { ++ if (++rpi->bitbuf_no >= RPIVID_BITBUFS) ++ rpi->bitbuf_no = 0; ++ } ++ else ++ { ++ sem_post(&rpi->bitbuf_sem); ++ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ ++ tend_phase(rpi, 0); ++ post_phase(rpi, de, 0); ++ ++ if (status < 0) ++ goto fail; ++ ++ // Phase 1 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 1); ++ rpi_sem_wait(&rpi->coeffbuf_sem); ++ coeffbuf_sem_claimed = 1; ++ tstart_phase(rpi, 1); ++ ++ status = 0; ++ for (;;) ++ { ++ // (Re-)allocate PU/COEFF stream space ++ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; ++ unsigned int pu_size; ++ ++ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; ++ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); ++ pu_size = pu_stride * de->PicHeightInCtbsY; ++ ++ if (pu_size >= total_size || status == -1) { ++ 
GPU_MEM_PTR_T newbuf; ++ ++ if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n"); ++ status = -1; ++ break; ++ } ++ gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no); ++ rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf; ++ status = 0; ++ continue; ++ } ++ ++ // Allocate all remaining space to coeff ++ coeff_base_vc = pu_base_vc + pu_size; ++ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 ++ ++ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); ++ ++ // Trigger command FIFO ++ apb_write(rpi, RPI_CFNUM, de->cmd_len); ++#if TRACE_DEV && 0 ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); ++ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); ++#endif ++ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); ++ ++ int_wait(rpi, 1); ++ ++ status = check_status(rpi, de); ++ ++ if (status == -1) ++ continue; ++ else if (status != 1) ++ break; ++ ++ // Status 1 means out of PU space so try again with more ++ // If we ran out of Coeff space then we are out of memory - we could possibly realloc? 
++ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; ++ } ++ ++ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we ++ // may reuse a live buffer when we kick the coeff sem ++ if (status == 0) ++ { ++ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) ++ rpi->coeffbuf_no = 0; ++ } ++ else ++ { ++ if (status == -1) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); ++ rv = AVERROR_INVALIDDATA; ++ } ++ } ++ ++ tend_phase(rpi, 1); ++ sem_post(&rpi->bitbuf_sem); ++ post_phase(rpi, de, 1); ++ ++ if (status != 0) ++ goto fail; ++ ++ // Phase 2 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 2); ++ ++ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) ++ { ++ // As we are in phase 2 already here we don't need to worry about ++ // ceoffbuf_no despite the early exit ++ post_phase(rpi, de, 2); ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n"); ++ goto fail; ++ } ++ ++ tstart_phase(rpi, 2); ++ ++ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); ++ ++ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); ++ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); ++ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); ++ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); ++ ++ // Keep the last thing we resolved as fallback for any ref we fail to ++ // resolve. As a final fallback use our current frame. The pels might ++ // not be there yet but at least the memory is valid. 
++ // ++ // Attempt to resolve the entire DPB - we could note what we have used ++ // in ref lists but probably simpler and more reliable to set the whole thing ++ { ++ AVFrame * fallback_frame = f; ++ for (i = 0; i != 16; ++i) { ++ // Avoid current frame ++ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i; ++ AVFrame * fr = hevc_fr->frame; ++ ++ if (fr != NULL && ++ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) ++ { ++ fallback_frame = fr; ++ } ++ else ++ { ++ fr = fallback_frame; ++ } ++ ++ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); ++ apb_write(rpi, 0x9004+16*i, 0); ++ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); ++ apb_write(rpi, 0x900C+16*i, 0); ++ } ++ } ++ ++ apb_write(rpi, RPI_CONFIG2, ++ (sps->bit_depth << 0) // BitDepthY ++ + (sps->bit_depth << 4) // BitDepthC ++ + ((sps->bit_depth>8) << 8) // BitDepthY ++ + ((sps->bit_depth>8) << 9) // BitDepthC ++ + (sps->log2_ctb_size <<10) ++ + (pps->constrained_intra_pred_flag <<13) ++ + (sps->sps_strong_intra_smoothing_enable_flag<<14) ++ + (sps->sps_temporal_mvp_enabled_flag <<15) ++ + (pps->log2_parallel_merge_level <<16) ++ + (s->sh.slice_temporal_mvp_enabled_flag <<19) ++ + (sps->pcm.loop_filter_disable_flag <<20) ++ + ((pps->cb_qp_offset&31) <<21) ++ + ((pps->cr_qp_offset&31) <<26)); ++ ++ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); ++ apb_write(rpi, RPI_CURRPOC, s->poc); ++ ++ // collocated reads/writes ++ if (sps->sps_temporal_mvp_enabled_flag) { ++ av_assert0(de->dpbno_col < RPIVID_COL_PICS); ++ av_assert0(dpbno_cur < RPIVID_COL_PICS); ++ ++ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); ++ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); ++ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); ++ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); ++ } ++ ++#if TRACE_DEV && 0 ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); 
++#endif ++ ++ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY); ++ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block ++ ++ int_wait(rpi, 2); ++ ++ tend_phase(rpi, 2); ++ coeffbuf_sem_claimed = 0; ++ sem_post(&rpi->coeffbuf_sem); ++ // Set valid here to avoid race in resolving in any pending phase 2 ++ av_rpi_zc_set_valid_frame(f); ++ ++ post_phase(rpi, de, 2); ++ ++ // Flush frame for CPU access ++ // Arguably the best place would be at the start of phase 2 but here ++ // will overlap with the wait ++ // ++ // * Even better would be to have better lock/unlock control in ZC for external access ++ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached ++ { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ rpi_cache_flush_finish(fe); ++ } ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] OK\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return 0; ++ ++fail: ++ av_rpi_zc_set_broken_frame(f); ++ if (coeffbuf_sem_claimed) ++ sem_post(&rpi->coeffbuf_sem); ++ abort_phases(rpi, de); // Dummy any unresolved phases ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] FAIL\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return rv; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++ ++#if TRACE_DEV ++static void dump_data(const uint8_t * p, size_t len) ++{ ++ size_t i; ++ for (i = 0; i < len; i += 16) { ++ size_t j; ++ printf("%04x", i); ++ for (j = 0; j != 16; ++j) { ++ printf("%c%02x", i == 8 ? 
'-' : ' ', p[i+j]); ++ } ++ printf("\n"); ++ } ++} ++#endif ++ ++#if OPT_EMU ++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) ++{ ++ unsigned int z = 0; ++ while (idx--) { ++ if (*b++ == 0) { ++ ++z; ++ if (z >= 2 && *b == 3) { ++ ++b; ++ z = 0; ++ } ++ } ++ else { ++ z = 0; ++ } ++ } ++ return b; ++} ++#endif ++ ++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { ++ const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes ++ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware ++ const GetBitContext *gb = &s->HEVClc->gb; ++ ++#if OPT_EMU ++ const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1); ++ const int len = de->nal_size - (ptr - de->nal_buffer); ++#else ++ const int len = 1 + gb->size_in_bits/8 - gb->index/8; ++ const void *ptr = &gb->buffer[gb->index/8]; ++#endif ++ ++#if TRACE_DEV ++ printf("Index=%d, /8=%#x\n", gb->index, gb->index/8); ++ dump_data(de->nal_buffer, 128); ++#endif ++ ++ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later ++ p1_apb_write(de, RPI_BFNUM, len); ++ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop ++ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Wavefront mode ++ ++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) ++{ ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int i, resetQPY=1; ++ int indep = !s->sh.dependent_slice_segment_flag; ++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ ++ if (ctb_addr_ts) ++ wpp_end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else if (ctb_col==0) ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ else ++ resetQPY=0; ++ program_slicecmds(de, 
s->slice_idx); ++ new_slice_segment(de, s); ++ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts); ++ for (i=0; ish.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int last_x = de->PicWidthInCtbsY-1; ++ if (de->PicWidthInCtbsY>2) ++ wpp_pause(de, ctb_row); ++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); ++ if (de->PicWidthInCtbsY==2) ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ if (de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ ctb_addr_ts += pps->column_width[0]; ++ wpp_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Tiles mode ++ ++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { ++ const HEVCPPS * const pps = s->ps.pps; ++ int i, resetQPY; ++ ++ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ resetQPY = ctb_addr_ts==0 ++ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] ++ || !s->sh.dependent_slice_segment_flag; ++ if (resetQPY) WriteProb(de, s); ++ program_slicecmds(de, s->slice_idx); ++ new_slice_segment(de, s); ++ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); ++ for (i=0; ish.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); ++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); ++ int last_x = pps->col_bd[tile_x+1]-1; ++ int last_y = pps->row_bd[tile_y+1]-1; ++ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); ++ WriteProb(de, s); ++ ctb_addr_ts += pps->column_width[tile_x] * 
pps->row_height[tile_y]; ++ new_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int cabac_start_align(HEVCContext *s) ++{ ++ GetBitContext *gb = &s->HEVClc->gb; ++ skip_bits(gb, 1); ++ align_get_bits(gb); ++ // Should look at getting rid of this ++ return ff_init_cabac_decoder(&s->HEVClc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static int rpi_hevc_decode_slice( ++ AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s = avctx->priv_data; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ const HEVCPPS *pps = s->ps.pps; ++ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return -1; ++ } ++ ++ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); ++ return -1; ++ } ++ de->state = RPIVID_DECODE_SLICE; ++ ++ de->nal_buffer = buffer; ++ de->nal_size = size; ++ ++#if !OPT_EMU ++// ff_hevc_cabac_init(s, ctb_addr_ts); ++ cabac_start_align(s); ++#endif ++ if (s->ps.sps->scaling_list_enable_flag) ++ populate_scaling_factors(de, s); ++ pps->entropy_coding_sync_enabled_flag? 
wpp_decode_slice(de, s, ctb_addr_ts) ++ : decode_slice(de, s, ctb_addr_ts); ++#if TRACE_ENTRY ++ printf(">>> %s[%p]\n", __func__, de); ++#endif ++ dec_env_release(rpi, de); ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpivid_retrieve_data(void *logctx, AVFrame *frame) ++{ ++ int rv; ++ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) ++ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); ++ return rv; ++} ++ ++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s = avctx->priv_data; ++ // Frame buffering + 1 output. Would need thread_count extra but we now ++ // alloc at the start of phase 2 so that is the only thread we need the ++ // extra buffer for. ++ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; ++ int rv; ++ ++ if (av_rpi_zc_in_use(avctx)) ++ { ++ const AVZcEnvPtr zc = avctx->opaque; ++ av_rpi_zc_set_decoder_pool_size(zc, pool_req); ++ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc ++ } ++ else ++ { ++ if (rpi->zc == NULL) { ++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this ++ // Alloc inside lock to make sure we only ever alloc one ++ if (rpi->zc == NULL) { ++ rpi->zc = av_rpi_zc_int_env_alloc(s); ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ } ++ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) ++ rv = (rpi->zc == NULL) ? 
AVERROR(ENOMEM) : ++ av_rpi_zc_get_buffer(rpi->zc, frame); ++ } ++ ++ if (rv == 0 && ++ (rv = ff_attach_decode_data(frame)) < 0) ++ { ++ av_frame_unref(frame); ++ } ++ ++ if (rv == 0) ++ { ++ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; ++ fdd->post_process = rpivid_retrieve_data; ++ } ++ ++ return rv; ++} ++ ++#if OPT_PHASE_TIMING ++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) ++{ ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", ++ bins[0], bins[1], bins[2], bins[3], ++ bins[4], bins[5], bins[6], bins[7], bins[8]); ++} ++#endif ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_free(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ dec_env_release(rpi, NULL); ++ ++ // Wait for everything else to stop ++ { ++ struct timespec tt; ++ clock_gettime(CLOCK_REALTIME, &tt); ++ tt.tv_sec += 2; ++ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { ++ const int err = errno; ++ if (err == ETIMEDOUT) { ++ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); ++ return -1; ++ } ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); ++ break; ++ } ++ } ++ } ++ ++#if OPT_PHASE_TIMING ++ { ++ unsigned int i; ++ for (i = 0; i != RPIVID_PHASES; ++i) { ++ const phase_wait_env_t * const p = rpi->phase_reqs + i; ++ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, ++ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), ++ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", ++ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], ++ time_thresholds[4], time_thresholds[5], 
time_thresholds[6], time_thresholds[7]); ++ log_bin_phase(avctx, p->time_bins); ++ log_bin_phase(avctx, p->time_bins3); ++ log_bin_phase(avctx, p->time_bins5); ++ av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n", ++ (unsigned int)(p->max_phase_time / 1000), ++ p->max_time_decode_order); ++ } ++ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); ++ } ++#endif ++ ++ if (rpi->dec_envs != NULL) ++ { ++ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { ++ dec_env_delete(rpi->dec_envs[i]); ++ } ++ av_freep(&rpi->dec_envs); ++ } ++ ++ av_rpi_zc_int_env_freep(&rpi->zc); ++ ++ gpu_free(&rpi->gcolbuf); ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ gpu_free(rpi->gbitbufs + i); ++ } ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ gpu_free(rpi->gcoeffbufs + i); ++ } ++ ++ unmap_devp(&rpi->regs, REGS_SIZE); ++ unmap_devp(&rpi->ints, INTS_SIZE); ++ ++ if (rpi->gpu_init_type > 0) ++ rpi_mem_gpu_uninit(); ++ ++ if (rpi->mbox_fd >= 0) { ++ mbox_release_clock(rpi->mbox_fd); ++ mbox_close(rpi->mbox_fd); ++ } ++ ++ sem_destroy(&rpi->ref_zero); ++ sem_destroy(&rpi->coeffbuf_sem); ++ sem_destroy(&rpi->bitbuf_sem); ++ ++#if TRACE_ENTRY ++ printf(">>> %s\n", __func__); ++#endif ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_init(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++// const char *err; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ if (avctx->width>4096 || avctx->height>4096) { ++ av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); ++ return AVERROR(ENOTSUP); ++ } ++ ++ memset(rpi, 0, sizeof(*rpi)); ++ ++ rpi->mbox_fd = -1; ++ rpi->decode_order = 0; ++ ++ // Initial PU/COEFF stream buffer split chosen as worst case seen so far ++ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU ++ ++ 
++ atomic_store(&rpi->ref_count, 1); ++ sem_init(&rpi->ref_zero, 0, 0); ++ ++ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); ++ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS); ++ ++ pthread_mutex_init(&rpi->phase_lock, NULL); ++ ++ if ((rpi->mbox_fd = mbox_open()) < 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n"); ++ goto fail; ++ } ++ mbox_request_clock(rpi->mbox_fd); ++ ++ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL || ++ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n"); ++ goto fail; ++ } ++ ++ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n"); ++ goto fail; ++ } ++ ++ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count); ++ goto fail; ++ } ++ ++ rpi->col_stride = rnd64(avctx->width); ++ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); ++ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n"); ++ goto fail; ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n"); ++ ++ return 0; ++ ++fail: ++ rpi_hevc_free(avctx); ++ return AVERROR_EXTERNAL; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++const AVHWAccel ff_hevc_rpi4_8_hwaccel = { 
++ .name = "hevc_rpi4_8", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_8, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++const AVHWAccel ff_hevc_rpi4_10_hwaccel = { ++ .name = "hevc_rpi4_10", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_10, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 4b2679eb38..8d80d19788 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -21,6 +21,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include + #include + #include + #include +@@ -29,57 +30,88 @@ + #include + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" ++#include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext.h" + #include "v4l2_context.h" + #include "v4l2_buffers.h" + #include "v4l2_m2m.h" ++#include "v4l2_req_dmabufs.h" ++#include "weak_link.h" + + #define USEC_PER_SEC 1000000 +-static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; ++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; + +-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) + { +- return 
V4L2_TYPE_IS_OUTPUT(buf->context->type) ? +- container_of(buf->context, V4L2m2mContext, output) : +- container_of(buf->context, V4L2m2mContext, capture); ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? ++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); + } + +-static inline AVCodecContext *logger(V4L2Buffer *buf) ++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) + { +- return buf_to_m2mctx(buf)->avctx; ++ return ctx_to_m2mctx(buf->context); + } + +-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) ++static inline AVCodecContext *logger(const V4L2Buffer * const buf) + { +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); ++ return buf_to_m2mctx(buf)->avctx; ++} + +- if (s->avctx->pkt_timebase.num) +- return s->avctx->pkt_timebase; +- return s->avctx->time_base; ++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) ++{ ++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); ++ const AVRational tb = s->avctx->pkt_timebase.num ? ++ s->avctx->pkt_timebase : ++ s->avctx->time_base; ++ return tb.num && tb.den ? tb : v4l2_timebase; + } + +-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) ++static inline struct timeval tv_from_int(const int64_t t) + { +- int64_t v4l2_pts; ++ return (struct timeval){ ++ .tv_usec = t % USEC_PER_SEC, ++ .tv_sec = t / USEC_PER_SEC ++ }; ++} + +- if (pts == AV_NOPTS_VALUE) +- pts = 0; ++static inline int64_t int_from_tv(const struct timeval t) ++{ ++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; ++} + ++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) ++{ + /* convert pts to v4l2 timebase */ +- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); +- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; +- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; ++ const int64_t v4l2_pts = ++ pts == AV_NOPTS_VALUE ? 
0 : ++ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); ++ out->buf.timestamp = tv_from_int(v4l2_pts); + } + +-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) ++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) + { +- int64_t v4l2_pts; +- ++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); ++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; ++#if 0 + /* convert pts back to encoder timebase */ +- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + +- avbuf->buf.timestamp.tv_usec; ++ return ++ avbuf->context->no_pts_rescale ? v4l2_pts : ++ v4l2_pts == 0 ? AV_NOPTS_VALUE : ++ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++#endif ++} + +- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) ++{ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ out->planes[plane].bytesused = bytesused; ++ out->planes[plane].length = length; ++ } else { ++ out->buf.bytesused = bytesused; ++ out->buf.length = length; ++ } + } + + static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) +@@ -116,6 +148,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) + return AVCOL_PRI_UNSPECIFIED; + } + ++static void v4l2_set_color(V4L2Buffer *buf, ++ const enum AVColorPrimaries avcp, ++ const enum AVColorSpace avcs, ++ const enum AVColorTransferCharacteristic avxc) ++{ ++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; ++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; ++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; ++ ++ switch (avcp) { ++ case AVCOL_PRI_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ ycbcr = V4L2_YCBCR_ENC_709; ++ break; ++ case AVCOL_PRI_BT470M: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ ycbcr = V4L2_YCBCR_ENC_601; ++ break; ++ case AVCOL_PRI_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_PRI_SMPTE170M: ++ cs 
= V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_PRI_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_PRI_BT2020: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ case AVCOL_PRI_SMPTE428: ++ case AVCOL_PRI_SMPTE431: ++ case AVCOL_PRI_SMPTE432: ++ case AVCOL_PRI_EBU3213: ++ case AVCOL_PRI_RESERVED: ++ case AVCOL_PRI_FILM: ++ case AVCOL_PRI_UNSPECIFIED: ++ default: ++ break; ++ } ++ ++ switch (avcs) { ++ case AVCOL_SPC_RGB: ++ cs = V4L2_COLORSPACE_SRGB; ++ break; ++ case AVCOL_SPC_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ break; ++ case AVCOL_SPC_FCC: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ break; ++ case AVCOL_SPC_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_SPC_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_SPC_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_SPC_BT2020_CL: ++ cs = V4L2_COLORSPACE_BT2020; ++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; ++ break; ++ case AVCOL_SPC_BT2020_NCL: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ default: ++ break; ++ } ++ ++ switch (avxc) { ++ case AVCOL_TRC_BT709: ++ xfer = V4L2_XFER_FUNC_709; ++ break; ++ case AVCOL_TRC_IEC61966_2_1: ++ xfer = V4L2_XFER_FUNC_SRGB; ++ break; ++ case AVCOL_TRC_SMPTE240M: ++ xfer = V4L2_XFER_FUNC_SMPTE240M; ++ break; ++ case AVCOL_TRC_SMPTE2084: ++ xfer = V4L2_XFER_FUNC_SMPTE2084; ++ break; ++ default: ++ break; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { ++ buf->context->format.fmt.pix_mp.colorspace = cs; ++ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; ++ buf->context->format.fmt.pix_mp.xfer_func = xfer; ++ } else { ++ buf->context->format.fmt.pix.colorspace = cs; ++ buf->context->format.fmt.pix.ycbcr_enc = ycbcr; ++ buf->context->format.fmt.pix.xfer_func = xfer; ++ } ++} ++ + static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) + { + enum v4l2_quantization qt; +@@ -134,6 +265,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) + return 
AVCOL_RANGE_UNSPECIFIED; + } + ++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) ++{ ++ const enum v4l2_quantization q = ++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : ++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : ++ V4L2_QUANTIZATION_DEFAULT; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { ++ buf->context->format.fmt.pix_mp.quantization = q; ++ } else { ++ buf->context->format.fmt.pix.quantization = q; ++ } ++} ++ + static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) + { + enum v4l2_ycbcr_encoding ycbcr; +@@ -210,73 +355,178 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) + return AVCOL_TRC_UNSPECIFIED; + } + +-static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) + { +- V4L2Buffer* avbuf = opaque; +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); ++ return V4L2_FIELD_IS_INTERLACED(buf->buf.field); ++} + +- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { +- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); ++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) ++{ ++ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; ++} + +- if (s->reinit) { +- if (!atomic_load(&s->refcount)) +- sem_post(&s->refsync); +- } else { +- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { +- /* no need to queue more buffers to the driver */ +- avbuf->status = V4L2BUF_AVAILABLE; +- } +- else if (avbuf->context->streamon) +- ff_v4l2_buffer_enqueue(avbuf); +- } ++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) ++{ ++ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : ++ is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; ++} + +- av_buffer_unref(&avbuf->context_ref); ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = avbuf->plane_info[i].offset; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } ++ ++ switch (avbuf->context->av_pix_fmt) { ++ case AV_PIX_FMT_YUYV422: ++ ++ layer->format = DRM_FORMAT_YUYV; ++ layer->nb_planes = 1; ++ ++ break; ++ ++ case AV_PIX_FMT_NV12: ++ case AV_PIX_FMT_NV21: ++ ++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? ++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case AV_PIX_FMT_YUV420P: ++ ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; + } + +-static int 
v4l2_buf_increase_ref(V4L2Buffer *in) ++static void v4l2_free_bufref(void *opaque, uint8_t *data) + { +- V4L2m2mContext *s = buf_to_m2mctx(in); ++ AVBufferRef * bufref = (AVBufferRef *)data; ++ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; ++ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); + +- if (in->context_ref) +- atomic_fetch_add(&in->context_refcount, 1); +- else { +- in->context_ref = av_buffer_ref(s->self_ref); +- if (!in->context_ref) +- return AVERROR(ENOMEM); ++ if (ctx != NULL) { ++ // Buffer still attached to context ++ V4L2m2mContext *s = buf_to_m2mctx(avbuf); + +- in->context_refcount = 1; +- } ++ ff_mutex_lock(&ctx->lock); + +- in->status = V4L2BUF_RET_USER; +- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); ++ ff_v4l2_buffer_set_avail(avbuf); + +- return 0; ++ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); ++ /* no need to queue more buffers to the driver */ ++ } ++ else if (ctx->streamon) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); ++ avbuf->buf.timestamp.tv_sec = 0; ++ avbuf->buf.timestamp.tv_usec = 0; ++ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER ++ } ++ else { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ } ++ ++ ff_weak_link_unlock(avbuf->context_wl); ++ av_buffer_unref(&bufref); + } + +-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) ++static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i) + { +- int ret; ++ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? 
b->m.planes[i].length : b->length; ++} + +- if (plane >= in->num_planes) +- return AVERROR(EINVAL); ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) ++{ ++ int i, ret; ++ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf); + +- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ +- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, +- in->plane_info[plane].length, v4l2_free_buffer, in, 0); +- if (!*buf) +- return AVERROR(ENOMEM); ++ for (i = 0; i < avbuf->num_planes; i++) { ++ int dma_fd = -1; ++ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i); ++ ++ if (s->db_ctl != NULL) { ++ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL) ++ return AVERROR(ENOMEM); ++ dma_fd = dmabuf_fd(avbuf->dmabuf[i]); ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) ++ avbuf->buf.m.planes[i].m.fd = dma_fd; ++ else ++ avbuf->buf.m.fd = dma_fd; ++ } ++ else { ++ struct v4l2_exportbuffer expbuf; ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buf.index; ++ expbuf.type = avbuf->buf.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ dma_fd = expbuf.fd; ++ } + +- ret = v4l2_buf_increase_ref(in); +- if (ret) +- av_buffer_unref(buf); ++ avbuf->drm_frame.objects[i].size = blen; ++ avbuf->drm_frame.objects[i].fd = dma_fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } + +- return ret; ++ return 0; + } + + static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) + { + unsigned int bytesused, length; ++ int rv = 0; + + if (plane >= out->num_planes) + return AVERROR(EINVAL); +@@ -284,32 +534,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i + length = out->plane_info[plane].length; + bytesused = FFMIN(size+offset, length); + +- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, 
FFMIN(size, length-offset)); +- +- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { +- out->planes[plane].bytesused = bytesused; +- out->planes[plane].length = length; +- } else { +- out->buf.bytesused = bytesused; +- out->buf.length = length; ++ if (size > length - offset) { ++ size = length - offset; ++ rv = AVERROR(ENOMEM); + } + +- return 0; ++ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); ++ ++ set_buf_length(out, plane, bytesused, length); ++ ++ return rv; ++} ++ ++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) ++{ ++ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); ++ AVBufferRef * newbuf; ++ ++ if (!bufref) ++ return NULL; ++ ++ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); ++ if (newbuf == NULL) ++ av_buffer_unref(&bufref); ++ ++ avbuf->status = V4L2BUF_RET_USER; ++ return newbuf; + } + + static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + { +- int i, ret; ++ int i; + + frame->format = avbuf->context->av_pix_fmt; + +- for (i = 0; i < avbuf->num_planes; i++) { +- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); +- if (ret) +- return ret; ++ frame->buf[0] = wrap_avbuf(avbuf); ++ if (frame->buf[0] == NULL) ++ return AVERROR(ENOMEM); + ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ /* 1. get references to the actual data */ ++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); ++ return 0; ++ } ++ ++ ++ /* 1. 
get references to the actual data */ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; + frame->linesize[i] = avbuf->plane_info[i].bytesperline; +- frame->data[i] = frame->buf[i]->data; + } + + /* fixup special cases */ +@@ -318,17 +593,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + case AV_PIX_FMT_NV21: + if (avbuf->num_planes > 1) + break; +- frame->linesize[1] = avbuf->plane_info[0].bytesperline; +- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; ++ frame->linesize[1] = frame->linesize[0]; ++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + break; + + case AV_PIX_FMT_YUV420P: + if (avbuf->num_planes > 1) + break; +- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; +- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; +- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; +- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); ++ frame->linesize[1] = frame->linesize[0] / 2; ++ frame->linesize[2] = frame->linesize[1]; ++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); ++ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; + break; + + default: +@@ -338,68 +613,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + return 0; + } + ++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) ++{ ++ if (dst_stride == src_stride && w + 32 >= dst_stride) { ++ memcpy(dst, src, dst_stride * h); ++ } ++ else { ++ while (--h >= 0) { ++ memcpy(dst, src, w); ++ dst += dst_stride; ++ src += 
src_stride; ++ } ++ } ++} ++ ++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) ++{ ++ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); ++} ++ ++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) ++ return AVERROR(EINVAL); ++ ++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ // Only currently cope with single buffer types ++ if (out->buf.length != 1) ++ return AVERROR_PATCHWELCOME; ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->planes[0].m.fd = src->objects[0].fd; ++ } ++ else { ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->buf.m.fd = src->objects[0].fd; ++ } ++ ++ // No need to copy src AVDescriptor and if we did then we may confuse ++ // fd close on free ++ out->ref_buf = av_buffer_ref(frame->buf[0]); ++ ++ return 0; ++} ++ + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { +- int i, ret; +- struct v4l2_format fmt = out->context->format; +- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? +- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; +- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
+- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; +- int is_planar_format = 0; +- +- switch (pixel_format) { +- case V4L2_PIX_FMT_YUV420M: +- case V4L2_PIX_FMT_YVU420M: +-#ifdef V4L2_PIX_FMT_YUV422M +- case V4L2_PIX_FMT_YUV422M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU422M +- case V4L2_PIX_FMT_YVU422M: +-#endif +-#ifdef V4L2_PIX_FMT_YUV444M +- case V4L2_PIX_FMT_YUV444M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU444M +- case V4L2_PIX_FMT_YVU444M: +-#endif +- case V4L2_PIX_FMT_NV12M: +- case V4L2_PIX_FMT_NV21M: +- case V4L2_PIX_FMT_NV12MT_16X16: +- case V4L2_PIX_FMT_NV12MT: +- case V4L2_PIX_FMT_NV16M: +- case V4L2_PIX_FMT_NV61M: +- is_planar_format = 1; +- } +- +- if (!is_planar_format) { +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); +- int planes_nb = 0; +- int offset = 0; +- +- for (i = 0; i < desc->nb_components; i++) +- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); +- +- for (i = 0; i < planes_nb; i++) { +- int size, h = height; +- if (i == 1 || i == 2) { ++ int i; ++ int num_planes = 0; ++ int pel_strides[4] = {0}; ++ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ ++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { ++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); ++ return -1; ++ } ++ ++ for (i = 0; i != desc->nb_components; ++i) { ++ if (desc->comp[i].plane >= num_planes) ++ num_planes = desc->comp[i].plane + 1; ++ pel_strides[desc->comp[i].plane] = desc->comp[i].step; ++ } ++ ++ if (out->num_planes > 1) { ++ if (num_planes != out->num_planes) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); ++ return -1; ++ } ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ if (is_chroma(desc, i, num_planes)) { ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + } +- size = frame->linesize[i] * h; +- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, 
offset); +- if (ret) +- return ret; +- offset += size; ++ ++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); + } +- return 0; + } ++ else ++ { ++ unsigned int offset = 0; ++ ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ int dst_stride = out->plane_info[0].bytesperline; ++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; ++ ++ if (is_chroma(desc, i, num_planes)) { ++ // Is chroma ++ dst_stride >>= desc->log2_chroma_w; ++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h); ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); ++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); ++ } ++ else { ++ // Is luma or alpha ++ offset += dst_stride * out->context->height; ++ } ++ if (offset > out->plane_info[0].length) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); ++ return -1; ++ } + +- for (i = 0; i < out->num_planes; i++) { +- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); +- if (ret) +- return ret; ++ cpy_2d(dst, dst_stride, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ } ++ set_buf_length(out, 0, offset, out->plane_info[0].length); + } +- + return 0; + } + +@@ -409,16 +743,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + * + ******************************************************************************/ + +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) + { +- v4l2_set_pts(out, frame->pts); +- +- return v4l2_buffer_swframe_to_buf(frame, out); ++ out->buf.flags = frame->key_frame ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); ++ // Beware that colour info is held in format rather than the actual ++ // v4l2 buffer struct so this may not be as useful as you might hope ++ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); ++ v4l2_set_color_range(out, frame->color_range); ++ // PTS & interlace are buffer vars ++ if (track_ts) ++ out->buf.timestamp = tv_from_int(track_ts); ++ else ++ v4l2_set_pts(out, frame->pts); ++ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); ++ ++ return frame->format == AV_PIX_FMT_DRM_PRIME ? ++ v4l2_buffer_primeframe_to_buf(frame, out) : ++ v4l2_buffer_swframe_to_buf(frame, out); + } + + int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + { + int ret; ++ V4L2Context * const ctx = avbuf->context; + + av_frame_unref(frame); + +@@ -429,17 +778,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + + /* 2. get frame information */ + frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); ++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? 
AV_PICTURE_TYPE_B : ++ AV_PICTURE_TYPE_NONE; + frame->color_primaries = v4l2_get_color_primaries(avbuf); + frame->colorspace = v4l2_get_color_space(avbuf); + frame->color_range = v4l2_get_color_range(avbuf); + frame->color_trc = v4l2_get_color_trc(avbuf); + frame->pts = v4l2_get_pts(avbuf); + frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); ++ frame->top_field_first = v4l2_buf_is_top_first(avbuf); + + /* these values are updated also during re-init in v4l2_process_driver_event */ +- frame->height = avbuf->context->height; +- frame->width = avbuf->context->width; +- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ frame->sample_aspect_ratio = ctx->sample_aspect_ratio; ++ ++ if (ctx->selection.height && ctx->selection.width) { ++ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; ++ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; ++ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? ++ frame->width - (ctx->selection.left + ctx->selection.width) : 0; ++ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? ++ frame->height - (ctx->selection.top + ctx->selection.height) : 0; ++ } + + /* 3. report errors upstream */ + if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { +@@ -452,15 +816,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + + int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + { +- int ret; +- + av_packet_unref(pkt); +- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); +- if (ret) +- return ret; ++ ++ pkt->buf = wrap_avbuf(avbuf); ++ if (pkt->buf == NULL) ++ return AVERROR(ENOMEM); + + pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; +- pkt->data = pkt->buf->data; ++ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; ++ pkt->flags = 0; + + if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) + pkt->flags |= AV_PKT_FLAG_KEY; +@@ -475,39 +839,107 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + return 0; + } + +-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp) + { + int ret; + +- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); +- if (ret) ++ if (extlen) { ++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); ++ if (ret) ++ return ret; ++ } ++ ++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); ++ if (ret && ret != AVERROR(ENOMEM)) + return ret; + +- v4l2_set_pts(out, pkt->pts); ++ if (timestamp) ++ out->buf.timestamp = tv_from_int(timestamp); ++ else ++ v4l2_set_pts(out, pkt->pts); + +- if (pkt->flags & AV_PKT_FLAG_KEY) +- out->flags = V4L2_BUF_FLAG_KEYFRAME; ++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + +- return 0; ++ return ret; ++} ++ ++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++{ ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); + } + +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ++ ++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) ++{ ++ V4L2Buffer * const avbuf = (V4L2Buffer *)data; ++ int i; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { ++ struct V4L2Plane_info *p = avbuf->plane_info + i; ++ if (p->mm_addr != NULL) ++ munmap(p->mm_addr, p->length); ++ } ++ ++ if (avbuf->dmabuf[0] == NULL) { ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { ++ if (avbuf->drm_frame.objects[i].fd != -1) ++ close(avbuf->drm_frame.objects[i].fd); ++ } ++ } ++ else { ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) { ++ dmabuf_free(avbuf->dmabuf[i]); ++ } ++ } ++ ++ av_buffer_unref(&avbuf->ref_buf); ++ ++ ff_weak_link_unref(&avbuf->context_wl); ++ ++ av_free(avbuf); ++} ++ ++ ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) + { +- V4L2Context *ctx = avbuf->context; + int ret, i; ++ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); ++ AVBufferRef * bufref; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + +- avbuf->buf.memory = V4L2_MEMORY_MMAP; ++ *pbufref = NULL; ++ if (avbuf == NULL) ++ return AVERROR(ENOMEM); ++ ++ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); ++ if (bufref == NULL) { ++ av_free(avbuf); ++ return AVERROR(ENOMEM); ++ } ++ ++ avbuf->context = ctx; ++ avbuf->buf.memory = mem; + avbuf->buf.type = ctx->type; + avbuf->buf.index = index; + ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { ++ avbuf->drm_frame.objects[i].fd = -1; ++ } ++ ++ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); ++ + if 
(V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.length = VIDEO_MAX_PLANES; + avbuf->buf.m.planes = avbuf->planes; + } + +- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); ++ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf); + if (ret < 0) +- return AVERROR(errno); ++ goto fail; + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->num_planes = 0; +@@ -520,6 +952,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->num_planes = 1; + + for (i = 0; i < avbuf->num_planes; i++) { ++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && ++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); + + avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? + ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : +@@ -527,25 +961,31 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset; ++ ++ if (want_mmap) ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + } else { + avbuf->plane_info[i].length = avbuf->buf.length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ avbuf->plane_info[i].offset = 0; ++ ++ if (want_mmap) ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + } + +- if (avbuf->plane_info[i].mm_addr == MAP_FAILED) +- return AVERROR(ENOMEM); ++ if (avbuf->plane_info[i].mm_addr == 
MAP_FAILED) { ++ avbuf->plane_info[i].mm_addr = NULL; ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } + } + + avbuf->status = V4L2BUF_AVAILABLE; + +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- return 0; +- + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.m.planes = avbuf->planes; + avbuf->buf.length = avbuf->num_planes; +@@ -555,20 +995,53 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.length = avbuf->planes[0].length; + } + +- return ff_v4l2_buffer_enqueue(avbuf); ++ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ if (s->output_drm) { ++ ret = v4l2_buffer_export_drm(avbuf); ++ if (ret) { ++ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n"); ++ goto fail; ++ } ++ } ++ } ++ ++ *pbufref = bufref; ++ return 0; ++ ++fail: ++ av_buffer_unref(&bufref); ++ return ret; + } + + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + { + int ret; ++ int qc; + +- avbuf->buf.flags = avbuf->flags; ++ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", ++ avbuf->context->name, avbuf->buf.index, ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, ++ avbuf->context->q_count); ++ } + + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); +- if (ret < 0) +- return AVERROR(errno); ++ if (ret < 0) { ++ int err = errno; ++ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", ++ avbuf->context->name, avbuf->buf.index, ++ err, strerror(err)); ++ return AVERROR(err); ++ } + ++ // Lock not wanted - if called from buffer free then lock already obtained ++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); ++ ++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", ++ avbuf->context->name, avbuf->buf.index, ++ avbuf->buf.timestamp.tv_sec, 
avbuf->buf.timestamp.tv_usec, qc); + + return 0; + } +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 8dbc7fc104..0bda4dd06b 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -27,29 +27,44 @@ + #include + #include + ++#include "libavutil/hwcontext_drm.h" + #include "avcodec.h" + + enum V4L2Buffer_status { + V4L2BUF_AVAILABLE, + V4L2BUF_IN_DRIVER, ++ V4L2BUF_IN_USE, + V4L2BUF_RET_USER, + }; + + /** + * V4L2Buffer (wrapper for v4l2_buffer management) + */ ++struct V4L2Context; ++struct ff_weak_link_client; ++struct dmabuf_h; ++ + typedef struct V4L2Buffer { +- /* each buffer needs to have a reference to its context */ ++ /* each buffer needs to have a reference to its context ++ * The pointer is good enough for most operation but once the buffer has ++ * been passed to the user the buffer may become orphaned so for free ops ++ * the weak link must be used to ensure that the context is actually ++ * there ++ */ + struct V4L2Context *context; ++ struct ff_weak_link_client *context_wl; + +- /* This object is refcounted per-plane, so we need to keep track +- * of how many context-refs we are holding. 
*/ +- AVBufferRef *context_ref; +- atomic_uint context_refcount; ++ /* DRM descriptor */ ++ AVDRMFrameDescriptor drm_frame; ++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we ++ * are done ++ */ ++ AVBufferRef * ref_buf; + + /* keep track of the mmap address and mmap length */ + struct V4L2Plane_info { +- int bytesperline; ++ size_t bytesperline; ++ size_t offset; + void * mm_addr; + size_t length; + } plane_info[VIDEO_MAX_PLANES]; +@@ -60,9 +75,9 @@ typedef struct V4L2Buffer { + struct v4l2_buffer buf; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + +- int flags; + enum V4L2Buffer_status status; + ++ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here + } V4L2Buffer; + + /** +@@ -98,6 +113,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); + */ + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp); ++ + /** + * Extracts the data from an AVFrame to a V4L2Buffer + * +@@ -106,7 +125,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); + + /** + * Initializes a V4L2Buffer +@@ -116,7 +135,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); + + /** + * Enqueues a V4L2Buffer +@@ -127,5 +146,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); 
+ */ + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); + ++static inline void ++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) ++{ ++ avbuf->status = V4L2BUF_AVAILABLE; ++ av_buffer_unref(&avbuf->ref_buf); ++} ++ + + #endif // AVCODEC_V4L2_BUFFERS_H +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index ff1ea8e57b..65b2648557 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -27,11 +27,13 @@ + #include + #include + #include ++#include "libavutil/avassert.h" + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "v4l2_buffers.h" + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" ++#include "weak_link.h" + + struct v4l2_format_update { + uint32_t v4l2_fmt; +@@ -41,26 +43,168 @@ struct v4l2_format_update { + int update_avfmt; + }; + +-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) + { +- return V4L2_TYPE_IS_OUTPUT(ctx->type) ? +- container_of(ctx, V4L2m2mContext, output) : +- container_of(ctx, V4L2m2mContext, capture); ++ return (int64_t)n; + } + +-static inline AVCodecContext *logger(V4L2Context *ctx) ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) + { +- return ctx_to_m2mctx(ctx)->avctx; ++ return (unsigned int)pts; + } + +-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. 
++static int64_t ++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .dts = avpkt->dts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++static int64_t ++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = 0, ++ .pts = frame->pts, ++ .dts = AV_NOPTS_VALUE, ++ .reordered_opaque = frame->reordered_opaque, ++ .pkt_pos = frame->pkt_pos, ++ .pkt_duration = frame->pkt_duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; + } + +-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_frame_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVFrame *const frame) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->reordered_opaque = x->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = t->dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); ++ return 0; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_pkt_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVPacket *const pkt) ++{ ++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) ++ { ++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? 
AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ pkt->pts = AV_NOPTS_VALUE; ++ } ++ else if (!t->discard) ++ { ++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (pkt->pts != AV_NOPTS_VALUE) ++ x->last_pts = pkt->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ // * Would like something much better than this...xlat(offset + out_count)? ++ pkt->dts = pkt->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ pkt->pts, t->track_pts, n); ++ return 0; ++} ++ ++ ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) ++{ ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? ++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); ++} ++ ++static inline AVCodecContext *logger(const V4L2Context *ctx) ++{ ++ return ctx_to_m2mctx(ctx)->avctx; + } + + static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) + return sar; + } + +-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) ++static inline int ctx_buffers_alloced(const V4L2Context * const ctx) ++{ ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) + { +- struct v4l2_format *fmt1 = &ctx->format; +- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
+- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || +- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height +- : +- fmt1->fmt.pix.width != fmt2->fmt.pix.width || +- fmt1->fmt.pix.height != fmt2->fmt.pix.height; ++ const struct v4l2_format *fmt1 = &ctx->format; ++ int ret = !ctx_buffers_alloced(ctx) || ++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || ++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height ++ : ++ fmt1->fmt.pix.width != fmt2->fmt.pix.width || ++ fmt1->fmt.pix.height != fmt2->fmt.pix.height); + + if (ret) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", ++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", + ctx->name, +- v4l2_get_width(fmt1), v4l2_get_height(fmt1), +- v4l2_get_width(fmt2), v4l2_get_height(fmt2)); ++ ctx_buffers_alloced(ctx), ++ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), ++ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); + + return ret; + } +@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd + } + } + +-/** +- * handle resolution change event and end of stream event +- * returns 1 if reinit was successful, negative if it failed +- * returns 0 if reinit was not executed +- */ +-static int v4l2_handle_event(V4L2Context *ctx) ++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); +- struct v4l2_format cap_fmt = s->capture.format; +- struct v4l2_format out_fmt = s->output.format; +- struct v4l2_event evt = { 0 }; +- int full_reinit, reinit, ret; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, ++ .target = V4L2_SEL_TGT_COMPOSE ++ }; + +- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); +- if (ret < 0) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); +- return 0; +- } ++ 
memset(r, 0, sizeof(*r)); ++ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) ++ return AVERROR(errno); + +- if (evt.type == V4L2_EVENT_EOS) { +- ctx->done = 1; +- return 0; +- } ++ *r = selection.r; ++ return 0; ++} + +- if (evt.type != V4L2_EVENT_SOURCE_CHANGE) +- return 0; ++static int do_source_change(V4L2m2mContext * const s) ++{ ++ AVCodecContext *const avctx = s->avctx; + +- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); +- return 0; +- } ++ int ret; ++ int reinit; ++ struct v4l2_format cap_fmt = s->capture.format; ++ ++ s->capture.done = 0; + + ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); + if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); ++ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); + return 0; + } + +- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); +- if (full_reinit) { +- s->output.height = v4l2_get_height(&out_fmt); +- s->output.width = v4l2_get_width(&out_fmt); +- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); +- } ++ get_default_selection(&s->capture, &s->capture.selection); ++ ++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); ++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) ++ reinit = 1; + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ s->capture.format = cap_fmt; + if (reinit) { +- s->capture.height = v4l2_get_height(&cap_fmt); +- s->capture.width = v4l2_get_width(&cap_fmt); +- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); ++ s->capture.height = ff_v4l2_get_format_height(&cap_fmt); ++ s->capture.width = ff_v4l2_get_format_width(&cap_fmt); + } + +- if (full_reinit || reinit) +- s->reinit = 1; +- +- if (full_reinit) { +- ret = ff_v4l2_m2m_codec_full_reinit(s); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); +- return AVERROR(EINVAL); +- } +- goto reinit_run; ++ // If we don't support selection 
(or it is bust) and we obviously have HD then kludge ++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && ++ (s->capture.height == 1088 && s->capture.width == 1920)) { ++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; + } + ++ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); ++ ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", ++ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, ++ s->capture.width, s->capture.height, ++ s->capture.selection.width, s->capture.selection.height, ++ s->capture.selection.left, s->capture.selection.top, reinit); ++ + if (reinit) { +- if (s->avctx) +- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); ++ if (avctx) ++ ret = ff_set_dimensions(s->avctx, ++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, ++ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height); + if (ret < 0) +- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); ++ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); + + ret = ff_v4l2_m2m_codec_reinit(s); + if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); ++ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); + return AVERROR(EINVAL); + } ++ ++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || ++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", ++ s->capture.width, s->capture.height, ++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); ++ return AVERROR(EINVAL); ++ } ++ ++ // Update pixel format - should only actually do something on initial change ++ s->capture.av_pix_fmt = ++ 
ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = s->capture.av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = s->capture.av_pix_fmt; ++ + goto reinit_run; + } + +- /* dummy event received */ +- return 0; ++ /* Buffers are OK so just stream off to ack */ ++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); ++ ++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); ++ if (ret) ++ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); ++ s->draining = 0; + + /* reinit executed */ + reinit_run: ++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); + return 1; + } + +@@ -280,171 +452,293 @@ static int v4l2_stop_encode(V4L2Context *ctx) + return 0; + } + +-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) ++// DQ a buffer ++// Amalgamates all the various ways there are of signalling EOS/Event to ++// generate a consistant EPIPE. ++// ++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) ++// ++// Returns: ++// 0 Success ++// AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in ++// * AVERROR(..) 
++ ++ static int ++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) + { +- struct v4l2_plane planes[VIDEO_MAX_PLANES]; +- struct v4l2_buffer buf = { 0 }; +- V4L2Buffer *avbuf; +- struct pollfd pfd = { +- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ +- .fd = ctx_to_m2mctx(ctx)->fd, ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ V4L2Buffer * avbuf; ++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); ++ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; ++ ++ struct v4l2_buffer buf = { ++ .type = ctx->type, ++ .memory = V4L2_MEMORY_MMAP, + }; +- int i, ret; + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { +- for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) +- break; +- } +- if (i == ctx->num_buffers) +- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " +- "userspace. Increase num_capture_buffers " +- "to prevent device deadlock or dropped " +- "packets/frames.\n"); +- } +- +- /* if we are draining and there are no more capture buffers queued in the driver we are done */ +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { +- for (i = 0; i < ctx->num_buffers; i++) { +- /* capture buffer initialization happens during decode hence +- * detection happens at runtime +- */ +- if (!ctx->buffers) +- break; +- +- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) +- goto start; ++ *ppavbuf = NULL; ++ ++ if (ctx->flag_last) ++ return AVERROR(EPIPE); ++ ++ if (is_mp) { ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { ++ const int err = errno; ++ av_assert0(AVERROR(err) < 0); ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", ++ ctx->name, av_err2str(AVERROR(err))); ++ ++ if (err == EPIPE) ++ ctx->flag_last = 1; ++ ++ return AVERROR(err); + } +- ctx->done = 1; +- 
return NULL; + } ++ atomic_fetch_sub(&ctx->q_count, 1); ++ ++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; ++ ff_v4l2_buffer_set_avail(avbuf); ++ avbuf->buf = buf; ++ if (is_mp) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buf.m.planes = avbuf->planes; ++ } ++ // Done with any attached buffer ++ av_buffer_unref(&avbuf->ref_buf); + +-start: +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- pfd.events = POLLOUT | POLLWRNORM; +- else { +- /* no need to listen to requests for more input while draining */ +- if (ctx_to_m2mctx(ctx)->draining) +- pfd.events = POLLIN | POLLRDNORM | POLLPRI; ++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { ++ // Zero length cap buffer return == EOS ++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); ++ ++ // Must reQ so we don't leak ++ // May not matter if the next thing we do is release all the ++ // buffers but better to be tidy. ++ ff_v4l2_buffer_enqueue(avbuf); ++ ++ ctx->flag_last = 1; ++ return AVERROR(EPIPE); ++ } ++ ++#ifdef V4L2_BUF_FLAG_LAST ++ // If flag_last set then this contains data but is the last frame ++ // so remember that but return OK ++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) ++ ctx->flag_last = 1; ++#endif + } + +- for (;;) { +- ret = poll(&pfd, 1, timeout); +- if (ret > 0) +- break; +- if (errno == EINTR) ++ *ppavbuf = avbuf; ++ return 0; ++} ++ ++/** ++ * handle resolution change event and end of stream event ++ * Expects to be called after the stream has stopped ++ * ++ * returns 1 if reinit was successful, negative if it failed ++ * returns 0 if reinit was not executed ++ */ ++static int ++get_event(V4L2m2mContext * const m) ++{ ++ AVCodecContext * const avctx = m->avctx; ++ struct v4l2_event evt = { 0 }; ++ ++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { ++ const int rv = AVERROR(errno); ++ if (rv == AVERROR(EINTR)) + continue; +- return NULL; ++ if (rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to 
get expected event - assume EOS\n"); ++ return AVERROR_EOF; ++ } ++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); ++ return rv; + } + +- /* 0. handle errors */ +- if (pfd.revents & POLLERR) { +- /* if we are trying to get free buffers but none have been queued yet +- no need to raise a warning */ +- if (timeout == 0) { +- for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); +- } +- } +- else +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); ++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); + +- return NULL; ++ if (evt.type == V4L2_EVENT_EOS) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); ++ return AVERROR_EOF; + } + +- /* 1. handle resolution changes */ +- if (pfd.revents & POLLPRI) { +- ret = v4l2_handle_event(ctx); +- if (ret < 0) { +- /* if re-init failed, abort */ +- ctx->done = 1; +- return NULL; +- } +- if (ret) { +- /* if re-init was successful drop the buffer (if there was one) +- * since we had to reconfigure capture (unmap all buffers) +- */ +- return NULL; ++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) ++ return do_source_change(m); ++ ++ return 0; ++} ++ ++static inline int ++dq_ok(const V4L2Context * const c) ++{ ++ return c->streamon && atomic_load(&c->q_count) != 0; ++} ++ ++// Get a buffer ++// If output then just gets the buffer in the expected way ++// If capture then runs the capture state m/c to deal with res change etc. 
++// If return value == 0 then *ppavbuf != NULL ++ ++static int ++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) ++{ ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); ++ ++ const unsigned int poll_cap = (POLLIN | POLLRDNORM); ++ const unsigned int poll_out = (POLLOUT | POLLWRNORM); ++ const unsigned int poll_event = POLLPRI; ++ ++ *ppavbuf = NULL; ++ ++ for (;;) { ++ struct pollfd pfd = { ++ .fd = m->fd, ++ // If capture && stream not started then assume we are waiting for the initial event ++ .events = !is_cap ? poll_out : ++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : ++ poll_event, ++ }; ++ int ret; ++ ++ if (ctx->done) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); ++ return AVERROR_EOF; + } +- } + +- /* 2. dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { ++ // If capture && timeout == -1 then also wait for rx buffer free ++ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining) ++ pfd.events |= poll_out; + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; ++ // If nothing Qed all we will get is POLLERR - avoid that ++ if ((pfd.events == poll_out && !dq_ok(&m->output)) || ++ (pfd.events == poll_cap && !dq_ok(&m->capture)) || ++ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); ++ return AVERROR(ENOSPC); ++ } + +- /* the driver is ready to accept more input; instead of waiting for the capture +- * buffer to complete we return NULL so input can proceed (we are single threaded) +- */ +- if (pfd.revents & (POLLOUT | POLLWRNORM)) +- return NULL; ++ // Timeout kludged s.t. 
"forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); ++ if (ret < 0) { ++ ret = AVERROR(errno); // Remember errno before logging etc. ++ av_assert0(ret < 0); + } + +-dequeue: +- memset(&buf, 0, sizeof(buf)); +- buf.memory = V4L2_MEMORY_MMAP; +- buf.type = ctx->type; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memset(planes, 0, sizeof(planes)); +- buf.length = VIDEO_MAX_PLANES; +- buf.m.planes = planes; ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); ++ ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; + } + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); +- if (ret) { +- if (errno != EAGAIN) { +- ctx->done = 1; +- if (errno != EPIPE) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- ctx->name, av_err2str(AVERROR(errno))); ++ if (ret == 0) { ++ if (timeout == -1) ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); ++ if (ff_v4l2_ctx_eos(ctx)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); ++ ret = get_event(m); ++ if (ret < 0) { ++ ctx->done = 1; ++ return ret; ++ } + } +- return NULL; ++ return AVERROR(EAGAIN); + } + +- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
+- buf.m.planes[0].bytesused : buf.bytesused; +- if (bytesused == 0) { ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; ++ } ++ ++ if ((pfd.revents & poll_event) != 0) { ++ ret = get_event(m); ++ if (ret < 0) { + ctx->done = 1; +- return NULL; ++ return ret; + } +-#ifdef V4L2_BUF_FLAG_LAST +- if (buf.flags & V4L2_BUF_FLAG_LAST) +- ctx->done = 1; +-#endif ++ continue; + } + +- avbuf = &ctx->buffers[buf.index]; +- avbuf->status = V4L2BUF_AVAILABLE; +- avbuf->buf = buf; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memcpy(avbuf->planes, planes, sizeof(planes)); +- avbuf->buf.m.planes = avbuf->planes; ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; + } +- return avbuf; ++ ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); ++ } ++ ++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); ++ return AVERROR_UNKNOWN; + } ++} + +- return NULL; ++// Clear out flags and timestamps that should be set by the user ++// Returns the passed avbuf ++static V4L2Buffer * ++clean_v4l2_buffer(V4L2Buffer * const avbuf) ++{ ++ struct v4l2_buffer *const buf = &avbuf->buf; ++ ++ buf->flags = 0; ++ buf->field = V4L2_FIELD_ANY; ++ buf->timestamp = (struct timeval){0}; ++ buf->timecode = (struct v4l2_timecode){0}; ++ buf->sequence = 0; ++ ++ return avbuf; ++} ++ ++int ++ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1) ++{ ++ V4L2Buffer * avbuf; ++ if (timeout1 != 0) { ++ int rv = get_qbuf(ctx, &avbuf, timeout1); ++ if (rv != 0) ++ return rv; ++ } ++ do { ++ get_qbuf(ctx, &avbuf, 0); ++ } while (avbuf); ++ return 0; + } + + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { +- int timeout = 0; /* return when no more buffers to dequeue */ + int i; + + /* get back as many output buffers as possible */ 
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- do { +- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); +- } ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ ff_v4l2_dq_all(ctx, 0); + + for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) +- return &ctx->buffers[i]; ++ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status == V4L2BUF_AVAILABLE) ++ return clean_v4l2_buffer(avbuf); + } + + return NULL; +@@ -452,25 +746,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + + static int v4l2_release_buffers(V4L2Context* ctx) + { +- struct v4l2_requestbuffers req = { +- .memory = V4L2_MEMORY_MMAP, +- .type = ctx->type, +- .count = 0, /* 0 -> unmaps buffers from the driver */ +- }; +- int i, j; ++ int i; ++ int ret = 0; ++ const int fd = ctx_to_m2mctx(ctx)->fd; + +- for (i = 0; i < ctx->num_buffers; i++) { +- V4L2Buffer *buffer = &ctx->buffers[i]; ++ // Orphan any buffers in the wild ++ ff_weak_link_break(&ctx->wl_master); ++ ++ if (ctx->bufrefs) { ++ for (i = 0; i < ctx->num_buffers; i++) ++ av_buffer_unref(ctx->bufrefs + i); ++ } ++ ++ if (fd != -1) { ++ struct v4l2_requestbuffers req = { ++ .memory = V4L2_MEMORY_MMAP, ++ .type = ctx->type, ++ .count = 0, /* 0 -> unmap all buffers from the driver */ ++ }; + +- for (j = 0; j < buffer->num_planes; j++) { +- struct V4L2Plane_info *p = &buffer->plane_info[j]; +- if (p->mm_addr && p->length) +- if (munmap(p->mm_addr, p->length) < 0) +- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1 && errno == EINTR) ++ continue; /* retry only when interrupted by a signal */ ++ ++ if (ret == -1) { /* real failure: report it once, never re-issue the ioctl in a loop */ ++ ret = AVERROR(errno); ++ ++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", ++ ctx->name, av_err2str(AVERROR(errno))); ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) ++ av_log(logger(ctx), AV_LOG_ERROR, ++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" ++ "for all buffers: 
\n" ++ " 1. drmModeRmFB(..)\n" ++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); + } + } ++ atomic_store(&ctx->q_count, 0); + +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ return ret; + } + + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) +@@ -499,6 +813,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm + + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + { ++ V4L2m2mContext* s = ctx_to_m2mctx(ctx); ++ V4L2m2mPriv *priv = s->avctx->priv_data; + enum AVPixelFormat pixfmt = ctx->av_pix_fmt; + struct v4l2_fmtdesc fdesc; + int ret; +@@ -512,21 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + return 0; + } + +- for (;;) { ++ for (;; ++fdesc.index) { + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc); + if (ret) + return AVERROR(EINVAL); + ++ if (priv->pix_fmt != AV_PIX_FMT_NONE) { ++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) ++ continue; ++ } ++ + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); + ret = v4l2_try_raw_format(ctx, pixfmt); +- if (ret){ +- fdesc.index++; +- continue; ++ if (ret == 0) { ++ *p = pixfmt; ++ return 0; + } +- +- *p = pixfmt; +- +- return 0; + } + + return AVERROR(EINVAL); +@@ -569,30 +886,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) + * + *****************************************************************************/ + ++ ++static void flush_all_buffers_status(V4L2Context* const ctx) ++{ ++ int i; ++ ++ if (!ctx->bufrefs) ++ return; ++ ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (buf->status == V4L2BUF_IN_DRIVER) ++ ff_v4l2_buffer_set_avail(buf); ++ } ++ atomic_store(&ctx->q_count, 0); ++} ++ ++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) ++{ ++ int i; ++ int rv; ++ ++ if (!ctx->bufrefs) { 
++ rv = ff_v4l2_context_init(ctx); ++ if (rv) { ++ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); ++ return rv; ++ } ++ } ++ ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (buf->status == V4L2BUF_AVAILABLE) { ++ rv = ff_v4l2_buffer_enqueue(buf); ++ if (rv < 0) ++ return rv; ++ } ++ } ++ return 0; ++} ++ + int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + { + int type = ctx->type; +- int ret; ++ int ret = 0; ++ AVCodecContext * const avctx = logger(ctx); + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); +- if (ret < 0) +- return AVERROR(errno); ++ // Avoid doing anything if there is nothing we can do ++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) ++ return 0; + +- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ ff_mutex_lock(&ctx->lock); + +- return 0; ++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ stuff_all_buffers(avctx, ctx); ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { ++ const int err = errno; ++ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); ++ ret = AVERROR(err); ++ } ++ else ++ { ++ if (cmd == VIDIOC_STREAMOFF) ++ flush_all_buffers_status(ctx); ++ else ++ ctx->first_buf = 1; ++ ++ ctx->streamon = (cmd == VIDIOC_STREAMON); ++ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); ++ } ++ ++ // Both stream off & on effectively clear flag_last ++ ctx->flag_last = 0; ++ ++ ff_mutex_unlock(&ctx->lock); ++ ++ return ret; + } + + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; ++ int64_t track_ts; + V4L2Buffer* avbuf; + int ret; + + if (!frame) { + ret = v4l2_stop_encode(ctx); + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); + s->draining= 1; + return 0; + } +@@ -601,23 +987,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); ++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); ++ ++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); + if (ret) + return ret; + + return ff_v4l2_buffer_enqueue(avbuf); + } + +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, ++ const void * extdata, size_t extlen) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer* avbuf; + int ret; ++ int64_t track_ts; + + if (!pkt->size) { + ret = v4l2_stop_decode(ctx); ++ // Log but otherwise ignore stop failure + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); + s->draining = 1; + return 0; + } +@@ -626,8 +1018,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); +- if (ret) ++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); ++ ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, 
track_ts); ++ if (ret == AVERROR(ENOMEM)) ++ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", ++ __func__, pkt->size, avbuf->planes[0].length); ++ else if (ret) + return ret; + + return ff_v4l2_buffer_enqueue(avbuf); +@@ -635,42 +1032,77 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * timeout=-1 blocks until: +- * 1. decoded frame available +- * 2. an input buffer is ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; ++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); + +- return AVERROR(EAGAIN); +- } +- +- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); ++ return 0; + } + +-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) ++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * blocks until: +- * 1. encoded packet available +- * 2. an input buffer ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv == AVERROR(ENOSPC) ? 
AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); + +- return AVERROR(EAGAIN); ++ return 0; ++} ++ ++// Return 0 terminated list of drm fourcc video formats for this context ++// NULL if none found or error; pN may be NULL, otherwise *pN gets the count ++// Returned list is malloced so must be freed ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN) ++{ ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int size = 0; ++ uint32_t * e = NULL; ++ if (pN) *pN = 0; // pN is optional - never dereference it unchecked ++ ++ for (i = 0; i < 1024; ++i) { ++ struct v4l2_fmtdesc fdesc = { ++ .index = i, ++ .type = ctx->type ++ }; ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc)) ++ return e; ++ ++ if (n + 1 >= size) { ++ unsigned int newsize = (size == 0) ? 16 : size * 2; ++ uint32_t * t = av_realloc(e, newsize * sizeof(*t)); ++ if (!t) ++ return e; ++ e = t; ++ size = newsize; ++ } ++ ++ e[n] = fdesc.pixelformat; ++ e[++n] = 0; ++ if (pN) ++ *pN = n; + } + +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ // If we've looped 1024 times we are clearly confused ++ if (pN) *pN = 0; ++ av_free(e); ++ return NULL; + } + + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1134,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) + + int ff_v4l2_context_set_format(V4L2Context* ctx) + { +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ int ret; ++ ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ if (ret != 0) ++ return ret; ++ ++ // Check returned size against min size and if smaller have another go ++ // Only worry about plane[0] as this is meant to enforce limits for ++ // encoded streams where we might know a bit more about the shape ++ // than the driver ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) ++ return 0; ++ 
ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; ++ } ++ else { ++ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) ++ return 0; ++ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; ++ } ++ ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ return ret; + } + + void ff_v4l2_context_release(V4L2Context* ctx) + { + int ret; + +- if (!ctx->buffers) ++ if (!ctx->bufrefs) + return; + + ret = v4l2_release_buffers(ctx); + if (ret) + av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); + +- av_freep(&ctx->buffers); ++ av_freep(&ctx->bufrefs); ++ av_buffer_unref(&ctx->frames_ref); ++ ++ ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); + } + +-int ff_v4l2_context_init(V4L2Context* ctx) ++ ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_requestbuffers req; +- int ret, i; +- +- if (!v4l2_type_supported(ctx)) { +- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); +- return AVERROR_PATCHWELCOME; +- } ++ int ret; ++ int i; + +- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); +- if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); ++ av_assert0(ctx->bufrefs == NULL); + + memset(&req, 0, sizeof(req)); +- req.count = ctx->num_buffers; +- req.memory = V4L2_MEMORY_MMAP; ++ req.count = req_buffers; ++ req.memory = mem; + req.type = ctx->type; +- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); +- if (ret < 0) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); +- return AVERROR(errno); ++ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno != EINTR) { ++ ret = AVERROR(errno); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); ++ return ret; ++ } + } + 
+ ctx->num_buffers = req.count; +- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); +- if (!ctx->buffers) { ++ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); ++ if (!ctx->bufrefs) { + av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); +- return AVERROR(ENOMEM); ++ ret = AVERROR(ENOMEM); /* ret still holds the successful REQBUFS result here */ ++ goto fail_release; + } + +- for (i = 0; i < req.count; i++) { +- ctx->buffers[i].context = ctx; +- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); +- if (ret < 0) { ++ ctx->wl_master = ff_weak_link_new(ctx); ++ if (!ctx->wl_master) { ++ ret = AVERROR(ENOMEM); ++ goto fail_release; ++ } ++ for (i = 0; i < ctx->num_buffers; i++) { ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); ++ if (ret) { + av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); +- goto error; ++ goto fail_release; + } + } + + av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), + req.count, +- v4l2_get_width(&ctx->format), +- v4l2_get_height(&ctx->format), ++ ff_v4l2_get_format_width(&ctx->format), ++ ff_v4l2_get_format_height(&ctx->format), + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); + + return 0; + +-error: ++fail_release: + v4l2_release_buffers(ctx); ++ av_freep(&ctx->bufrefs); ++ return ret; ++} ++ ++int ff_v4l2_context_init(V4L2Context* ctx) ++{ ++ struct v4l2_queryctrl qctrl; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ int ret; ++ ++ // It is not valid to reinit a context without a previous release ++ av_assert0(ctx->bufrefs == NULL); ++ ++ if (!v4l2_type_supported(ctx)) { ++ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); ++ return AVERROR_PATCHWELCOME; ++ } + +- av_freep(&ctx->buffers); ++ ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); ++ atomic_init(&ctx->q_count, 0); ++ ++ if (s->output_drm) { ++ AVHWFramesContext *hwframes; ++ ++ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); ++ if (!ctx->frames_ref) { ++ ret = AVERROR(ENOMEM); ++ goto fail_unlock; ++ } ++ ++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; ++ hwframes->format = AV_PIX_FMT_DRM_PRIME; ++ hwframes->sw_format = ctx->av_pix_fmt; ++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; ++ hwframes->height = ctx->height != 0 ? 
ctx->height : s->avctx->height; ++ ret = av_hwframe_ctx_init(ctx->frames_ref); ++ if (ret < 0) ++ goto fail_unref_hwframes; ++ } ++ ++ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); ++ goto fail_unref_hwframes; ++ } ++ ++ memset(&qctrl, 0, sizeof(qctrl)); ++ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT; ++ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) { ++ ret = AVERROR(errno); ++ if (ret != AVERROR(EINVAL)) { ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret)); ++ goto fail_unref_hwframes; ++ } ++ // Control unsupported - set default if wanted ++ if (ctx->num_buffers < 2) ++ ctx->num_buffers = 4; ++ } ++ else { ++ if (ctx->num_buffers < 2) ++ ctx->num_buffers = qctrl.minimum + 2; ++ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum); ++ } ++ ++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); ++ if (ret < 0) ++ goto fail_unref_hwframes; ++ ++ return 0; + ++fail_unref_hwframes: ++ av_buffer_unref(&ctx->frames_ref); ++fail_unlock: ++ ff_mutex_destroy(&ctx->lock); + return ret; + } +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 22a9532444..0c8c020be1 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -31,6 +31,7 @@ + #include "libavutil/pixfmt.h" + #include "libavutil/frame.h" + #include "libavutil/buffer.h" ++#include "libavutil/thread.h" + #include "v4l2_buffers.h" + + typedef struct V4L2Context { +@@ -70,28 +71,57 @@ typedef struct V4L2Context { + */ + int width, height; + AVRational sample_aspect_ratio; ++ struct v4l2_rect selection; + + /** +- * Indexed array of V4L2Buffers ++ * If the default size of buffer is less than this then try to ++ * set to this. 
+ */ +- V4L2Buffer *buffers; ++ uint32_t min_buf_size; ++ ++ /** ++ * Indexed array of pointers to V4L2Buffers ++ */ ++ AVBufferRef **bufrefs; + + /** + * Readonly after init. + */ + int num_buffers; + ++ /** ++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF ++ */ ++ enum v4l2_memory buf_mem; ++ + /** + * Whether the stream has been started (VIDIOC_STREAMON has been sent). + */ + int streamon; + ++ /* 1st buffer after stream on */ ++ int first_buf; ++ + /** + * Either no more buffers available or an unrecoverable error was notified + * by the V4L2 kernel driver: once set the context has to be exited. + */ + int done; + ++ int flag_last; ++ ++ /** ++ * If NZ then when Qing frame/pkt use this rather than the ++ * "real" PTS ++ */ ++ uint64_t track_ts; ++ ++ AVBufferRef *frames_ref; ++ atomic_int q_count; ++ struct ff_weak_link_master *wl_master; ++ ++ AVMutex lock; ++ pthread_cond_t cond; + } V4L2Context; + + /** +@@ -119,6 +149,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx); + */ + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe); + ++/** ++ * Get the list of drm fourcc pixel formats for this context ++ * ++ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context ++ * description for required variables. ++ * @param[in] pN A pointer to receive the number of formats ++ * found. May be NULL if not wanted. ++ * @return Pointer to malloced list of zero terminated formats, ++ * NULL if none or error. As list is malloced it must be ++ * freed. ++ */ ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN); ++ + /** + * Releases a V4L2Context. + * +@@ -147,7 +190,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); + * @param[inout] pkt The AVPacket to dequeue to. + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
+ */ +-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); ++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout); + + /** + * Dequeues a buffer from a V4L2Context to an AVFrame. +@@ -156,7 +199,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); + * @param[in] ctx The V4L2Context to dequeue from. + * @param[inout] f The AVFrame to dequeue to. + * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) ++ * + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. ++ * AVERROR(ENOSPC) if no buffer available to put ++ * the frame in + */ + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +@@ -170,7 +216,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + * @param[in] pkt A pointer to an AVPacket. + * @return 0 in case of success, a negative error otherwise. + */ +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); + + /** + * Enqueues a buffer to a V4L2Context from an AVFrame +@@ -183,4 +229,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); + */ + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); + ++/** ++ * Dequeue all buffers on this queue ++ * ++ * Used to recycle output buffers ++ * ++ * @param[in] ctx The V4L2Context to dequeue from. ++ * @param[in] timeout1 A timeout on dequeuing the 1st buffer, ++ * all others have a timeout of zero ++ * @return If timeout1 is non-zero and the first dequeue fails, the ++ * error from that first dequeue operation; 0 otherwise. 
++ */ ++int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1); ++ ++/** ++ * Returns the number of buffers currently queued ++ * ++ * @param[in] ctx The V4L2Context to evaluate ++ */ ++static inline int ++ff_v4l2_context_q_count(const V4L2Context* const ctx) ++{ ++ return atomic_load(&ctx->q_count); ++} ++ + #endif // AVCODEC_V4L2_CONTEXT_H +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index cdfd579810..143656e792 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -35,6 +35,15 @@ + #include "v4l2_context.h" + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" ++#include "v4l2_req_dmabufs.h" ++ ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ + + static inline int v4l2_splane_video(struct v4l2_capability *cap) + { +@@ -68,7 +77,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + + s->capture.done = s->output.done = 0; + s->capture.name = "capture"; ++ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + s->output.name = "output"; ++ s->output.buf_mem = s->input_drm ? 
V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); + sem_init(&s->refsync, 0, 0); + +@@ -85,18 +96,58 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + if (v4l2_mplane_video(&cap)) { + s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ s->output.format.type = s->output.type; + return 0; + } + + if (v4l2_splane_video(&cap)) { + s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ s->output.format.type = s->output.type; + return 0; + } + + return AVERROR(EINVAL); + } + ++static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_format fmt = {.type = s->output.type}; ++ int rv; ++ uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt); ++ unsigned int w; ++ unsigned int h; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ fmt.fmt.pix_mp.pixelformat = pixfmt; ++ fmt.fmt.pix_mp.width = avctx->width; ++ fmt.fmt.pix_mp.height = avctx->height; ++ } ++ else { ++ fmt.fmt.pix.pixelformat = pixfmt; ++ fmt.fmt.pix.width = avctx->width; ++ fmt.fmt.pix.height = avctx->height; ++ } ++ ++ rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt); ++ ++ if (rv != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ w = ff_v4l2_get_format_width(&fmt); ++ h = ff_v4l2_get_format_height(&fmt); ++ ++ if (w < avctx->width || h < avctx->height) { ++ av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ + static int v4l2_probe_driver(V4L2m2mContext *s) + { + void *log_ctx = s->avctx; +@@ -116,6 +167,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s) + goto done; + } + ++ // If being given frames (encode) check that V4L2 can cope with the size ++ if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO && ++ 
(ret = check_size(s->avctx, s)) != 0) ++ goto done; ++ + ret = ff_v4l2_context_get_format(&s->capture, 1); + if (ret) { + av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n"); +@@ -215,13 +271,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) + av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); + + /* 2. unmap the capture buffers (v4l2 and ffmpeg): +- * we must wait for all references to be released before being allowed +- * to queue new buffers. + */ +- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); +- if (atomic_load(&s->refcount)) +- while(sem_wait(&s->refsync) == -1 && errno == EINTR); +- + ff_v4l2_context_release(&s->capture); + + /* 3. get the new capture format */ +@@ -240,7 +290,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) + + /* 5. complete reinit */ + s->draining = 0; +- s->reinit = 0; + + return 0; + } +@@ -274,7 +323,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) + + /* start again now that we know the stream dimensions */ + s->draining = 0; +- s->reinit = 0; + + ret = ff_v4l2_context_get_format(&s->output, 0); + if (ret) { +@@ -328,10 +376,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) + ff_v4l2_context_release(&s->capture); + sem_destroy(&s->refsync); + +- close(s->fd); ++ if (s->fd != -1) ++ close(s->fd); + av_frame_unref(s->frame); + av_frame_free(&s->frame); + av_packet_unref(&s->buf_pkt); ++ av_freep(&s->extdata_data); ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); + + av_free(s); + } +@@ -344,6 +396,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + if (!s) + return 0; + ++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); ++ ++ if (s->avctx && av_codec_is_decoder(s->avctx->codec)) ++ av_packet_unref(&s->buf_pkt); ++ + if (s->fd >= 0) { + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); + if (ret) +@@ -355,8 +412,20 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + } + + 
ff_v4l2_context_release(&s->output); ++ av_buffer_unref(&s->device_ref); ++ ++ dmabufs_ctl_unref(&s->db_ctl); ++ ++ if (s->fd != -1) { ++ close(s->fd); ++ s->fd = -1; ++ } + + s->self_ref = NULL; ++ // This is only called on avctx close so after this point we don't have that ++ // Crash sooner if we find we are using it (can still log with avctx = NULL) ++ s->avctx = NULL; ++ priv->context = NULL; + av_buffer_unref(&priv->context_ref); + + return 0; +@@ -400,35 +469,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) + return v4l2_configure_contexts(s); + } + +-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) ++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) + { +- *s = av_mallocz(sizeof(V4L2m2mContext)); +- if (!*s) ++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); ++ ++ *pps = NULL; ++ if (!s) + return AVERROR(ENOMEM); + +- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), ++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), + &v4l2_m2m_destroy_context, NULL, 0); + if (!priv->context_ref) { +- av_freep(s); ++ av_free(s); + return AVERROR(ENOMEM); + } + + /* assign the context */ +- priv->context = *s; +- (*s)->priv = priv; ++ priv->context = s; ++ s->priv = priv; + + /* populate it */ +- priv->context->capture.num_buffers = priv->num_capture_buffers; +- priv->context->output.num_buffers = priv->num_output_buffers; +- priv->context->self_ref = priv->context_ref; +- priv->context->fd = -1; ++ s->capture.num_buffers = priv->num_capture_buffers; ++ s->output.num_buffers = priv->num_output_buffers; ++ s->self_ref = priv->context_ref; ++ s->fd = -1; ++ xlat_init(&s->xlat); + + priv->context->frame = av_frame_alloc(); + if (!priv->context->frame) { + av_buffer_unref(&priv->context_ref); +- *s = NULL; /* freed when unreferencing context_ref */ + return AVERROR(ENOMEM); + } + ++ *pps = s; + return 0; + } +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 
b67b216331..a506e69d67 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -30,6 +30,7 @@ + #include + + #include "libavcodec/avcodec.h" ++#include "libavutil/pixfmt.h" + #include "v4l2_context.h" + + #define container_of(ptr, type, member) ({ \ +@@ -38,7 +39,39 @@ + + #define V4L_M2M_DEFAULT_OPTS \ + { "num_output_buffers", "Number of buffers in the output context",\ +- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } ++ ++#define FF_V4L2_M2M_TRACK_SIZE 128 ++typedef struct V4L2m2mTrackEl { ++ int discard; // If we see this buffer its been flushed, so discard ++ int pending; ++ int pkt_size; ++ int64_t pts; ++ int64_t dts; ++ int64_t reordered_opaque; ++ int64_t pkt_pos; ++ int64_t pkt_duration; ++ int64_t track_pts; ++} V4L2m2mTrackEl; ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++ int64_t guess; ++} pts_stats_t; ++ ++typedef struct xlat_track_s { ++ unsigned int track_no; ++ int64_t last_pts; // Last valid PTS decoded ++ int64_t last_opaque; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++} xlat_track_t; ++ ++struct dmabufs_ctl; + + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; +@@ -52,10 +85,10 @@ typedef struct V4L2m2mContext { + AVCodecContext *avctx; + sem_t refsync; + atomic_uint refcount; +- int reinit; + + /* null frame/packet received */ + int draining; ++ int running; + AVPacket buf_pkt; + + /* Reference to a frame. 
Only used during encoding */ +@@ -66,6 +99,36 @@ typedef struct V4L2m2mContext { + + /* reference back to V4L2m2mPriv */ + void *priv; ++ ++ AVBufferRef *device_ref; ++ ++ /* generate DRM frames */ ++ int output_drm; ++ ++ /* input frames are drmprime */ ++ int input_drm; ++ ++ /* Frame tracking */ ++ xlat_track_t xlat; ++ ++ pts_stats_t pts_stat; ++ ++ /* req pkt */ ++ int req_pkt; ++ int reorder_size; ++ ++ /* Ext data sent */ ++ int extdata_sent; ++ /* Ext data sent in packet - overrides ctx */ ++ void * extdata_data; ++ size_t extdata_size; ++ ++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 ++ /* Quirks */ ++ unsigned int quirks; ++ ++ struct dmabufs_ctl * db_ctl; + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +@@ -76,6 +139,8 @@ typedef struct V4L2m2mPriv { + + int num_output_buffers; + int num_capture_buffers; ++ const char * dmabuf_alloc; ++ enum AVPixelFormat pix_fmt; + } V4L2m2mPriv; + + /** +@@ -129,4 +194,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); + */ + int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); + ++ ++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++} ++ ++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++} ++ ++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) ++{ ++ return ctx->flag_last; ++} ++ ++ + #endif /* AVCODEC_V4L2_M2M_H */ +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index ab07c0a24a..e7fd8980e5 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -21,8 +21,14 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + #include + #include ++ ++#include "libavutil/avassert.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" + #include "libavutil/pixfmt.h" + #include "libavutil/pixdesc.h" + #include "libavutil/opt.h" +@@ -30,75 +36,279 @@ + #include "libavcodec/decode.h" + #include "libavcodec/internal.h" + ++#include "libavcodec/hwaccels.h" ++#include "libavcodec/internal.h" ++#include "libavcodec/hwconfig.h" ++ + #include "v4l2_context.h" + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" ++#include "v4l2_req_dmabufs.h" + +-static int v4l2_try_start(AVCodecContext *avctx) ++#if CONFIG_H264_DECODER ++#include "h264_parse.h" ++#endif ++#if CONFIG_HEVC_DECODER ++#include "hevc_parse.h" ++#endif ++ ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++ ++#ifndef FF_API_BUFFER_SIZE_T ++#define FF_API_BUFFER_SIZE_T 1 ++#endif ++ ++#define DUMP_FAILED_EXTRADATA 0 ++ ++#if DUMP_FAILED_EXTRADATA ++static inline char hex1(unsigned int x) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; +- struct v4l2_selection selection = { 0 }; +- int ret; ++ x &= 0xf; ++ return x <= 9 ? '0' + x : 'a' + x - 10; ++} + +- /* 1. 
start the output process */ +- if (!output->streamon) { +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); +- if (ret < 0) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); +- return ret; +- } ++static inline char * hex2(char * s, unsigned int x) ++{ ++ *s++ = hex1(x >> 4); ++ *s++ = hex1(x); ++ return s; ++} ++ ++static inline char * hex4(char * s, unsigned int x) ++{ ++ s = hex2(s, x >> 8); ++ s = hex2(s, x); ++ return s; ++} ++ ++static inline char * dash2(char * s) ++{ ++ *s++ = '-'; ++ *s++ = '-'; ++ return s; ++} ++ ++static void ++data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len) ++{ ++ size_t i; ++ s = hex4(s, offset); ++ m += offset; ++ for (i = 0; i != 8; ++i) { ++ *s++ = ' '; ++ s = len > i + offset ? hex2(s, *m++) : dash2(s); ++ } ++ *s++ = ' '; ++ *s++ = ':'; ++ for (; i != 16; ++i) { ++ *s++ = ' '; ++ s = len > i + offset ? hex2(s, *m++) : dash2(s); + } ++ *s++ = 0; ++} + +- if (capture->streamon) +- return 0; ++static void ++log_dump(void * logctx, int lvl, const void * const data, const size_t len) ++{ ++ size_t i; ++ for (i = 0; i < len; i += 16) { ++ char buf[80]; ++ data16(buf, i, data, len); ++ av_log(logctx, lvl, "%s\n", buf); ++ } ++} ++#endif + +- /* 2. 
get the capture format */ +- capture->format.type = capture->type; +- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); +- return ret; ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) ++{ ++ return stats->last_interval; ++} ++ ++static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess) ++{ ++ if (stats->last_count <= 1) ++ return stats->last_pts; ++ if (stats->last_pts == AV_NOPTS_VALUE || ++ fail_bad_guess && (stats->last_interval == 0 || ++ stats->last_count >= STATS_LAST_COUNT_MAX)) ++ return AV_NOPTS_VALUE; ++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; ++} ++ ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; + } + +- /* 2.1 update the AVCodecContext */ +- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); +- capture->av_pix_fmt = avctx->pix_fmt; ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; + +- /* 3. 
set the crop parameters */ +- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- selection.r.height = avctx->coded_height; +- selection.r.width = avctx->coded_width; +- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); +- if (!ret) { +- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); +- } else { +- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); +- /* update the size of the resulting frame */ +- capture->height = selection.r.height; +- capture->width = selection.r.width; ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; + } + } + +- /* 4. 
init the capture context now that we have the capture format */ +- if (!capture->buffers) { +- ret = ff_v4l2_context_init(capture); +- if (ret) { +- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); +- return AVERROR(ENOMEM); ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ ++// If abdata == NULL then this just counts space required ++// Unpacks avcC if detected ++static int ++h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata) ++{ ++ const uint8_t * const xdend = extradata + extrasize; ++ const uint8_t * p = extradata; ++ uint8_t * d = abdata; ++ unsigned int n; ++ unsigned int len; ++ const unsigned int hdrlen = 4; ++ unsigned int need_pps = 1; ++ ++ if (extrasize < 8) ++ return AVERROR(EINVAL); ++ ++ if (p[0] == 0 && p[1] == 0) { ++ // Assume a couple of leading zeros are good enough to indicate NAL ++ if (abdata) ++ memcpy(d, p, extrasize); ++ return extrasize; ++ } ++ ++ // avcC starts with a 1 ++ if (p[0] != 1) ++ return AVERROR(EINVAL); ++ ++ p += 5; ++ n = *p++ & 0x1f; ++ ++doxps: ++ while (n--) { ++ if (xdend - p < 2) ++ return AVERROR(EINVAL); ++ len = (p[0] << 8) | p[1]; ++ p += 2; ++ if (xdend - p < (ptrdiff_t)len) ++ return AVERROR(EINVAL); ++ if (abdata) { ++ d[0] = 0; ++ d[1] = 0; ++ d[2] = 0; ++ d[3] = 1; ++ memcpy(d + 4, p, len); + } ++ d += len + hdrlen; ++ p += len; ++ } ++ if (need_pps) { ++ need_pps = 0; ++ if (p >= xdend) ++ return AVERROR(EINVAL); ++ n = *p++; ++ goto doxps; + } + +- /* 5. 
start the capture process */ +- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); +- if (ret) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); ++ return d - abdata; ++} ++ ++static int ++copy_extradata(AVCodecContext * const avctx, ++ const void * const src_data, const int src_len, ++ void ** const pdst_data, size_t * const pdst_len) ++{ ++ int len; ++ ++ *pdst_len = 0; ++ av_freep(pdst_data); ++ ++ if (avctx->codec_id == AV_CODEC_ID_H264) ++ len = h264_xd_copy(src_data, src_len, NULL); ++ else ++ len = src_len < 0 ? AVERROR(EINVAL) : src_len; ++ ++ // Zero length is OK but we want to stop - -ve is error val ++ if (len <= 0) ++ return len; ++ ++ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) ++ return AVERROR(ENOMEM); ++ ++ if (avctx->codec_id == AV_CODEC_ID_H264) ++ h264_xd_copy(src_data, src_len, *pdst_data); ++ else ++ memcpy(*pdst_data, src_data, len); ++ *pdst_len = len; ++ ++ return 0; ++} ++ ++ ++ ++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) ++{ ++ int ret; ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ ++ if (s->output.streamon) ++ return 0; ++ ++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); ++ if (ret != 0) { ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); + return ret; + } + ++ // STREAMON should do implicit START so this just for those that don't. ++ // It is optional so don't worry if it fails ++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { ++ ret = AVERROR(errno); ++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); ++ } ++ else { ++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); ++ } ++ return 0; ++} ++ ++static int v4l2_try_start(AVCodecContext *avctx) ++{ ++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int ret; ++ ++ /* 1. 
start the output process */ ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; + return 0; + } + +@@ -133,46 +343,822 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) + return 0; + } + +-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++static void ++set_best_effort_pts(AVCodecContext *const avctx, ++ pts_stats_t * const ps, ++ AVFrame *const frame) ++{ ++ pts_stats_add(ps, frame->pts); ++ ++#if FF_API_PKT_PTS ++FF_DISABLE_DEPRECATION_WARNINGS ++ frame->pkt_pts = frame->pts; ++FF_ENABLE_DEPRECATION_WARNINGS ++#endif ++ frame->best_effort_timestamp = pts_stats_guess(ps, 1); ++ // If we can't guess from just PTS - try DTS ++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) ++ frame->best_effort_timestamp = frame->pkt_dts; ++ ++ // We can't emulate what s/w does in a useful manner and using the ++ // "correct" answer seems to just confuse things. ++ frame->pkt_dts = frame->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++} ++ ++static void ++xlat_flush(xlat_track_t * const x) ++{ ++ unsigned int i; ++ // Do not reset track_no - this ensures that any frames left in the decoder ++ // that turn up later get discarded. 
++ ++ x->last_pts = AV_NOPTS_VALUE; ++ x->last_opaque = 0; ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { ++ x->track_els[i].pending = 0; ++ x->track_els[i].discard = 1; ++ } ++} ++ ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ xlat_flush(x); ++} ++ ++static int ++xlat_pending(const V4L2m2mContext * const s) ++{ ++ const xlat_track_t *const x = &s->xlat; ++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; ++ int i; ++ const int64_t now = pts_stats_guess(&s->pts_stat, 0); ++ int64_t first_dts = AV_NOPTS_VALUE; ++ int no_dts_count = 0; ++ unsigned int interval = pts_stats_interval(&s->pts_stat); ++ ++ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { ++ const V4L2m2mTrackEl * const t = x->track_els + n; ++ ++ if (first_dts == AV_NOPTS_VALUE) ++ if (t->dts == AV_NOPTS_VALUE) ++ ++no_dts_count; ++ else ++ first_dts = t->dts; ++ ++ // Discard only set on never-set or flushed entries ++ // So if we get here we've never successfully decoded a frame so allow ++ // more frames into the buffer before stalling ++ if (t->discard) ++ return i - 16; ++ ++ // If we've got this frame out then everything before this point ++ // must have entered the decoder ++ if (!t->pending) ++ break; ++ ++ // If we've never seen a pts all we can do is count frames ++ if (now == AV_NOPTS_VALUE) ++ continue; ++ ++ if (t->dts != AV_NOPTS_VALUE && now >= t->dts) ++ break; ++ } ++ ++ if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) { ++ const int iframes = (first_dts - now) / (int)interval; ++ const int t = iframes - s->reorder_size + no_dts_count; ++ ++// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n", ++// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count); ++ ++ if (iframes > 0 && iframes < 64 && t < i) { ++ return t; ++ } ++ } ++ ++ return i; ++} ++ ++static inline int 
stream_started(const V4L2m2mContext * const s) { ++ return s->output.streamon; ++} ++ ++#define NQ_OK 0 ++#define NQ_Q_FULL 1 ++#define NQ_SRC_EMPTY 2 ++#define NQ_NONE 3 ++#define NQ_DRAINING 4 ++#define NQ_DEAD 5 ++ ++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) ++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) ++ ++// do_not_get If true then no new packet will be got but status will ++// be set appropriately ++ ++// AVERROR_EOF Flushing an already flushed stream ++// -ve Error (all errors except EOF are unexpected) ++// NQ_OK (0) OK ++// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) ++// NQ_SRC_EMPTY Src empty (do not retry) ++// NQ_NONE Enqueue not attempted ++// NQ_DRAINING At EOS, dQ dest until EOS there too ++// NQ_DEAD Not running (do not retry, do not attempt capture dQ) ++ ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; + int ret; + +- if (!s->buf_pkt.size) { +- ret = ff_decode_get_packet(avctx, &s->buf_pkt); +- if (ret < 0 && ret != AVERROR_EOF) ++ // If we don't already have a coded packet - get a new one ++ // We will already have a coded pkt if the output Q was full last time we ++ // tried to Q it ++ if (!s->buf_pkt.size && !do_not_get) { ++ unsigned int i; ++ ++ for (i = 0; i < 256; ++i) { ++ uint8_t * side_data; ++#if FF_API_BUFFER_SIZE_T ++ int side_size; ++#else ++ size_t side_size; ++#endif ++ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (ret != 0) ++ break; ++ ++ // New extradata is the only side-data we undertand ++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); ++ if (side_data) { ++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); ++ if ((ret = copy_extradata(avctx, side_data, (int)side_size, 
&s->extdata_data, &s->extdata_size)) < 0) ++ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret)); ++ s->extdata_sent = 0; ++ } ++ ++ if (s->buf_pkt.size != 0) ++ break; ++ ++ if (s->buf_pkt.side_data_elems == 0) { ++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); ++ ret = AVERROR_EOF; ++ break; ++ } ++ ++ // Retry a side-data only pkt ++ } ++ // If i >= 256 something has gone wrong ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); ++ return AVERROR(EIO); ++ } ++ ++ if (ret == AVERROR(EAGAIN)) { ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); ++ return NQ_DEAD; ++ } ++ return NQ_SRC_EMPTY; ++ } ++ ++ if (ret == AVERROR_EOF) { ++ // EOF - enter drain mode ++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", ++ ret, s->buf_pkt.size, stream_started(s), s->draining); ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); ++ s->draining = 1; ++ s->capture.done = 1; ++ return AVERROR_EOF; ++ } ++ ++ if (!s->draining) { ++ // Calling enqueue with an empty pkt starts drain ++ av_assert0(s->buf_pkt.size == 0); ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); ++ return ret; ++ } ++ } ++ return NQ_DRAINING; ++ } ++ ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); + return ret; ++ } ++ } ++ ++ if (s->draining) { ++ if (s->buf_pkt.size) { ++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); ++ av_packet_unref(&s->buf_pkt); ++ } ++ return NQ_DRAINING; + } + +- if (s->draining) +- goto dequeue; ++ if (!s->buf_pkt.size) ++ return NQ_NONE; + +- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); +- if (ret < 0 && ret != AVERROR(EAGAIN)) +- goto fail; ++ 
if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ ++ if (s->extdata_sent) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); ++ else ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); + +- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ +- if (ret != AVERROR(EAGAIN)) ++ if (ret == AVERROR(EAGAIN)) { ++ // Out of input buffers - keep packet ++ ret = NQ_Q_FULL; ++ } ++ else { ++ // In all other cases we are done with this packet + av_packet_unref(&s->buf_pkt); ++ s->extdata_sent = 1; + +- if (!s->draining) { +- ret = v4l2_try_start(avctx); + if (ret) { +- /* cant recover */ +- if (ret != AVERROR(ENOMEM)) +- ret = 0; +- goto fail; ++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); ++ return ret; + } + } + +-dequeue: +- return ff_v4l2_context_dequeue_frame(capture, frame, -1); +-fail: +- av_packet_unref(&s->buf_pkt); ++ // Start if we haven't ++ { ++ const int ret2 = v4l2_try_start(avctx); ++ if (ret2) { ++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); ++ ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; ++ } ++ } ++ ++ return ret; ++} ++// Wait on ctx->cond until ctx->q_count is non-zero or ctx->streamon is clear; returns 0, or a -ve AVERROR if the wait itself fails ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; ++ ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) { ++ rv = AVERROR(rv); // pthread_cond_wait returns the error number rather than setting errno ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; ++ } ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int src_rv = -1; ++ int dst_rv = 1; // Non-zero (done), non-negative (error) number ++ unsigned int i = 0; ++ ++ do { ++ const int pending = xlat_pending(s); ++ const int prefer_dq = (pending > 4); ++ const int last_src_rv = src_rv; ++ ++ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); ++ ++ // Enqueue another pkt for decode if ++ // (a) We don't have a lot of stuff in the buffer already OR ++ // (b) ... we (think we) do but we've failed to get a frame already OR ++ // (c) We've dequeued a lot of frames without asking for input ++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); ++ ++ // If we got a frame last time or we've already tried to get a frame and ++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) ++ // indicating that we want more input. 
++ // This should mean that once decode starts we enter a stable state where ++ // we alternately ask for input and produce output ++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) ++ break; ++ ++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { ++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); ++ break; ++ } ++ ++ // Try to get a new frame if ++ // (a) we haven't already got one AND ++ // (b) enqueue returned a status indicating that decode should be attempted ++ if (dst_rv != 0 && TRY_DQ(src_rv)) { ++ // Pick a timeout depending on state ++ // The pending count isn't completely reliable so it is good enough ++ // hint that we want a frame but not good enough to require it in ++ // all cases; however if it has got > 31 that exceeds its margin of ++ // error so require a frame to prevent ridiculous levels of latency ++ const int t = ++ src_rv == NQ_Q_FULL ? -1 : ++ src_rv == NQ_DRAINING ? 300 : ++ prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0; ++ ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ ++ // Failure due to no buffer in Q? 
++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ if (dst_rv == 0) { ++ set_best_effort_pts(avctx, &s->pts_stat, frame); ++ if (!s->running) { ++ s->running = 1; ++ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n"); ++ } ++ } ++ ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF\n"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); ++ } ++ ++ ++i; ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %u\n", i); ++ src_rv = AVERROR(EIO); ++ } ++ ++ // Continue trying to enqueue packets if either ++ // (a) we succeeded last time OR ++ // (b) we didn't ret a frame and we can retry the input ++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); ++ ++ // Ensure that the frame contains nothing if we aren't returning a frame ++ // (might happen when discarding) ++ if (dst_rv) ++ av_frame_unref(frame); ++ ++ // If we got a frame this time ask for a pkt next time ++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; ++ ++#if 0 ++ if (dst_rv == 0) ++ { ++ static int z = 0; ++ if (++z > 50) { ++ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); ++ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); ++ return -1; ++ } ++ } ++#endif ++ ++ return dst_rv == 0 ? 0 : ++ src_rv < 0 ? src_rv : ++ dst_rv < 0 ? 
dst_rv : ++ AVERROR(EAGAIN); ++} ++ ++#if 0 ++#include ++static int64_t us_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; ++} ++ ++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ int ret; ++ const int64_t now = us_time(); ++ int64_t done; ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ret = v4l2_receive_frame2(avctx, frame); ++ done = us_time(); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); + return ret; + } ++#endif ++ ++static uint32_t ++avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile) ++{ ++ switch (codec_id) { ++ case AV_CODEC_ID_H264: ++ switch (avprofile) { ++ case FF_PROFILE_H264_BASELINE: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE; ++ case FF_PROFILE_H264_CONSTRAINED_BASELINE: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE; ++ case FF_PROFILE_H264_MAIN: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN; ++ case FF_PROFILE_H264_EXTENDED: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED; ++ case FF_PROFILE_H264_HIGH: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH; ++ case FF_PROFILE_H264_HIGH_10: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10; ++ case FF_PROFILE_H264_HIGH_10_INTRA: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA; ++ case FF_PROFILE_H264_MULTIVIEW_HIGH: ++ case FF_PROFILE_H264_HIGH_422: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422; ++ case FF_PROFILE_H264_HIGH_422_INTRA: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA; ++ case FF_PROFILE_H264_STEREO_HIGH: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH; ++ case FF_PROFILE_H264_HIGH_444_PREDICTIVE: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE; ++ case FF_PROFILE_H264_HIGH_444_INTRA: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA; ++ case FF_PROFILE_H264_CAVLC_444: ++ return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA; ++ case 
FF_PROFILE_H264_HIGH_444: ++ default: ++ break; ++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12, ++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13, ++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14, ++// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16, ++// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17, ++ } ++ break; ++ case AV_CODEC_ID_MPEG2VIDEO: ++ case AV_CODEC_ID_MPEG4: ++ case AV_CODEC_ID_VC1: ++ case AV_CODEC_ID_VP8: ++ case AV_CODEC_ID_VP9: ++ case AV_CODEC_ID_AV1: ++ // Most profiles are a simple number that matches the V4L2 enum ++ return avprofile; ++ default: ++ break; ++ } ++ return ~(uint32_t)0; ++} ++ ++// This check mirrors Chrome's profile check by testing to see if the profile ++// exists as a possible value for the V4L2 profile control ++static int ++check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) ++{ ++ struct v4l2_queryctrl query_ctrl; ++ struct v4l2_querymenu query_menu; ++ uint32_t profile_id; ++ ++ // An unset profile is almost certainly zero or -99 - do not reject ++ if (avctx->profile <= 0) { ++ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile); ++ return 0; ++ } ++ ++ memset(&query_ctrl, 0, sizeof(query_ctrl)); ++ switch (avctx->codec_id) { ++ case AV_CODEC_ID_MPEG2VIDEO: ++ profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE; ++ break; ++ case AV_CODEC_ID_MPEG4: ++ profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE; ++ break; ++ case AV_CODEC_ID_H264: ++ profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE; ++ break; ++ case AV_CODEC_ID_VP8: ++ profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE; ++ break; ++ case AV_CODEC_ID_VP9: ++ profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE; ++ break; ++#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE ++ case AV_CODEC_ID_AV1: ++ profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE; ++ break; ++#endif ++ default: ++ av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id); ++ return 0; ++ } ++ ++ query_ctrl = 
(struct v4l2_queryctrl){.id = profile_id}; ++ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) { ++ av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id); ++ } ++ else { ++ av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id); ++ ++ query_menu = (struct v4l2_querymenu){ ++ .id = query_ctrl.id, ++ .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile), ++ }; ++ ++ if (query_menu.index > query_ctrl.maximum || ++ query_menu.index < query_ctrl.minimum || ++ ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) { ++ return AVERROR(ENOENT); ++ } ++ } ++ ++ return 0; ++} ++// Check coded_width x coded_height against the driver's VIDIOC_ENUM_FRAMESIZES entries for fcc; returns 0 on a match or when the check is skipped/unsupported, else a -ve AVERROR ++static int ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc) ++{ ++ unsigned int i; ++ const uint32_t w = avctx->coded_width; ++ const uint32_t h = avctx->coded_height; ++ ++ if (w == 0 || h == 0 || fcc == 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ ++ for (i = 0;; ++i) { ++ struct v4l2_frmsizeenum fs = { ++ .index = i, ++ .pixel_format = fcc, ++ }; ++ ++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { ++ const int err = AVERROR(errno); ++ if (err == AVERROR(EINTR)) ++ continue; ++ if (i == 0 && err == AVERROR(ENOTTY)) { ++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); ++ return 0; ++ } ++ if (err != AVERROR(EINVAL)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err)); ++ return err; ++ } ++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", ++ w, h, av_fourcc2str(fcc), i); ++ return err; ++ } ++ ++ switch (fs.type) { ++ case V4L2_FRMSIZE_TYPE_DISCRETE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: 
%dx%d\n", __func__, i, ++ fs.discrete.width,fs.discrete.height); ++ if (w == fs.discrete.width && h == fs.discrete.height) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_STEPWISE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && ++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && ++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_CONTINUOUS: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) ++ return 0; ++ break; ++ default: ++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type); ++ return AVERROR(EINVAL); ++ } ++ } ++} ++ ++static int ++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_capability cap; ++ ++ memset(&cap, 0, sizeof(cap)); ++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { ++ int err = errno; ++ if (err == EINTR) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); ++ return AVERROR(err); ++ } ++ ++ // Could be made table driven if we have a few more but right now there ++ // seems no point ++ ++ // Meson (amlogic) always gives a resolution changed event after output ++ // streamon and userspace must (re)allocate capture buffers and streamon ++ // capture to clear the event even if the capture buffers were the 
right ++ // size in the first place. ++ if (strcmp(cap.driver, "meson-vdec") == 0) ++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); ++ return 0; ++} ++ ++// This heuristic is for H264 but use for everything ++static uint32_t max_coded_size(const AVCodecContext * const avctx) ++{ ++ uint32_t wxh = avctx->coded_width * avctx->coded_height; ++ uint32_t size; ++ ++ size = wxh * 3 / 2; ++ // H.264 Annex A table A-1 gives minCR which is either 2 or 4 ++ // unfortunately that doesn't yield an actually useful limit ++ // and it should be noted that frame 0 is special cased to allow ++ // a bigger number which really isn't helpful for us. So just pick ++ // frame_size / 2 ++ size /= 2; ++ // Add 64k to allow for any overheads and/or encoder hopefulness ++ // with small WxH ++ return size + (1 << 16); ++} ++ ++static void ++parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ s->reorder_size = 0; ++ ++ if (!avctx->extradata || !avctx->extradata_size) ++ return; ++ ++ switch (avctx->codec_id) { ++#if CONFIG_H264_DECODER ++ case AV_CODEC_ID_H264: ++ { ++ H264ParamSets ps; ++ int is_avc = 0; ++ int nal_length_size = 0; ++ int ret; ++ ++ memset(&ps, 0, sizeof(ps)); ++ ++ ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, ++ &ps, &is_avc, &nal_length_size, ++ avctx->err_recognition, avctx); ++ if (ret > 0) { ++ const SPS * sps = NULL; ++ unsigned int i; ++ for (i = 0; i != MAX_SPS_COUNT; ++i) { ++ if (ps.sps_list[i]) { ++ sps = (const SPS *)ps.sps_list[i]->data; ++ break; ++ } ++ } ++ if (sps) { ++ avctx->profile = ff_h264_get_profile(sps); ++ avctx->level = sps->level_idc; ++ s->reorder_size = sps->num_reorder_frames; ++ } ++ } ++ ff_h264_ps_uninit(&ps); ++ break; ++ } ++#endif ++#if CONFIG_HEVC_DECODER ++ case AV_CODEC_ID_HEVC: ++ { ++ HEVCParamSets ps; ++ HEVCSEI sei; ++ int is_nalff = 0; ++ int nal_length_size 
= 0; ++ int ret; ++ ++ memset(&ps, 0, sizeof(ps)); ++ memset(&sei, 0, sizeof(sei)); ++ ++ ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, ++ &ps, &sei, &is_nalff, &nal_length_size, ++ avctx->err_recognition, 0, avctx); ++ if (ret > 0) { ++ const HEVCSPS * sps = NULL; ++ unsigned int i; ++ for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) { ++ if (ps.sps_list[i]) { ++ sps = (const HEVCSPS *)ps.sps_list[i]->data; ++ break; ++ } ++ } ++ if (sps) { ++ avctx->profile = sps->ptl.general_ptl.profile_idc; ++ avctx->level = sps->ptl.general_ptl.level_idc; ++ s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering; ++ } ++ } ++ ff_hevc_ps_uninit(&ps); ++ ff_hevc_reset_sei(&sei); ++ break; ++ } ++#endif ++ default: ++ break; ++ } ++} ++ ++static int ++choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ const V4L2m2mPriv * const priv = avctx->priv_data; ++ unsigned int fmts_n; ++ uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n); ++ enum AVPixelFormat *fmts2 = NULL; ++ enum AVPixelFormat t; ++ enum AVPixelFormat gf_pix_fmt; ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int pref_n = 1; ++ int rv = AVERROR(ENOENT); ++ ++ if (!fmts) ++ return AVERROR(ENOENT); ++ ++ if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) { ++ rv = AVERROR(ENOMEM); ++ goto error; ++ } ++ ++ // Filter for formats that are supported by ffmpeg and ++ // can accomodate the stream size ++ fmts2[n++] = AV_PIX_FMT_DRM_PRIME; ++ for (i = 0; i != fmts_n; ++i) { ++ const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO); ++ if (f == AV_PIX_FMT_NONE) ++ continue; ++ ++ if (check_size(avctx, s, fmts[i]) != 0) ++ continue; ++ ++ if (f == priv->pix_fmt) ++ pref_n = n; ++ fmts2[n++] = f; ++ } ++ fmts2[n] = AV_PIX_FMT_NONE; ++ ++ if (n < 2) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__); ++ goto error; ++ } ++ ++ // Put preferred s/w 
format at the end - ff_get_format will put it in sw_pix_fmt ++ t = fmts2[n - 1]; ++ fmts2[n - 1] = fmts2[pref_n]; ++ fmts2[pref_n] = t; ++ ++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ ++ if (gf_pix_fmt == AV_PIX_FMT_NONE) ++ goto error; ++ ++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ s->capture.av_pix_fmt = avctx->sw_pix_fmt; ++ s->output_drm = 1; ++ } ++ else { ++ avctx->pix_fmt = gf_pix_fmt; ++ s->capture.av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } ++ ++ // Get format converts capture.av_pix_fmt back into a V4L2 format in the context ++ if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0) ++ goto error; ++ rv = ff_v4l2_context_set_format(&s->capture); ++ ++error: ++ av_free(fmts2); ++ av_free(fmts); ++ return rv; ++} + + static av_cold int v4l2_decode_init(AVCodecContext *avctx) + { +@@ -181,10 +1167,27 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + V4L2m2mPriv *priv = avctx->priv_data; + int ret; + ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ if (avctx->codec_id == AV_CODEC_ID_H264) { ++ if (avctx->ticks_per_frame == 1) { ++ if(avctx->time_base.den < INT_MAX/2) { ++ avctx->time_base.den *= 2; ++ } else ++ avctx->time_base.num /= 2; ++ } ++ avctx->ticks_per_frame = 2; ++ } ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; + ++ parse_extradata(avctx, s); ++ ++ xlat_init(&s->xlat); ++ pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ + capture = &s->capture; + output = &s->output; + +@@ -192,14 +1195,45 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + * by the v4l2 driver; this event will trigger a full pipeline reconfig and + * 
the proper values will be retrieved from the kernel driver. + */ +- output->height = capture->height = avctx->coded_height; +- output->width = capture->width = avctx->coded_width; ++// output->height = capture->height = avctx->coded_height; ++// output->width = capture->width = avctx->coded_width; ++ output->height = capture->height = 0; ++ output->width = capture->width = 0; + + output->av_codec_id = avctx->codec_id; + output->av_pix_fmt = AV_PIX_FMT_NONE; ++ output->min_buf_size = max_coded_size(avctx); + + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; ++ capture->min_buf_size = 0; ++ ++ capture->av_pix_fmt = AV_PIX_FMT_NONE; ++ s->output_drm = 0; ++ ++ s->db_ctl = NULL; ++ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { ++ if (strcmp(priv->dmabuf_alloc, "cma") == 0) ++ s->db_ctl = dmabufs_ctl_new(); ++ else { ++ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc); ++ return AVERROR(EINVAL); ++ } ++ if (!s->db_ctl) { ++ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc); ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); ++ if (!s->device_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ ret = av_hwdevice_ctx_init(s->device_ref); ++ if (ret < 0) ++ return ret; + + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); +@@ -208,12 +1242,90 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + return ret; + } + +- return v4l2_prepare_decoder(s); ++ if (avctx->extradata && ++ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret)); ++#if DUMP_FAILED_EXTRADATA ++ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size); ++#endif ++ return ret; ++ } ++ ++ if ((ret = get_quirks(avctx, s)) != 0) ++ 
return ret; ++ ++ if ((ret = check_profile(avctx, s)) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); ++ return ret; ++ } ++ ++ // Size check done as part of format filtering ++ if ((ret = choose_capture_format(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ ++ return 0; + } + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) + { +- return ff_v4l2_m2m_codec_end(avctx->priv_data); ++ int rv; ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ rv = ff_v4l2_m2m_codec_end(avctx->priv_data); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); ++ return rv; ++} ++ ++static void v4l2_decode_flush(AVCodecContext *avctx) ++{ ++ // An alternatve and more drastic form of flush is to simply do this: ++ // v4l2_decode_close(avctx); ++ // v4l2_decode_init(avctx); ++ // The downside is that this keeps a decoder open until all the frames ++ // associated with it have been returned. This is a bit wasteful on ++ // possibly limited h/w resources and fails on a Pi for this reason unless ++ // more GPU mem is allocated than is the default. 
++ ++ V4L2m2mPriv * const priv = avctx->priv_data; ++ V4L2m2mContext * const s = priv->context; ++ V4L2Context * const output = &s->output; ++ V4L2Context * const capture = &s->capture; ++ ++ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); ++ ++ // Reflushing everything is benign, quick and avoids having to worry about ++ // states like EOS processing so don't try to optimize out (having got it ++ // wrong once) ++ ++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ ++ // Clear any buffered input packet ++ av_packet_unref(&s->buf_pkt); ++ ++ // Clear a pending EOS ++ if (ff_v4l2_ctx_eos(capture)) { ++ // Arguably we could delay this but this is easy and doesn't require ++ // thought or extra vars ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); ++ } ++ ++ // V4L2 makes no guarantees about whether decoded frames are flushed or not ++ // so mark all frames we are tracking to be discarded if they appear ++ xlat_flush(&s->xlat); ++ ++ // resend extradata ++ s->extdata_sent = 0; ++ // clear status vars ++ s->running = 0; ++ s->draining = 0; ++ output->done = 0; ++ capture->done = 0; ++ ++ // Stream on will occur when we actually submit a new frame ++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); + } + + #define OFFSET(x) offsetof(V4L2m2mPriv, x) +@@ -222,10 +1334,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) + static const AVOption options[] = { + V4L_M2M_DEFAULT_OPTS, + { "num_capture_buffers", "Number of buffers in the capture context", +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, ++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, ++ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), 
AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, + { NULL}, + }; + ++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { ++ HW_CONFIG_INTERNAL(DRM_PRIME), ++ NULL ++}; ++ + #define M2MDEC_CLASS(NAME) \ + static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ + .class_name = #NAME "_v4l2m2m_decoder", \ +@@ -246,9 +1365,15 @@ static const AVOption options[] = { + .init = v4l2_decode_init, \ + .receive_frame = v4l2_receive_frame, \ + .close = v4l2_decode_close, \ ++ .flush = v4l2_decode_flush, \ + .bsfs = bsf_name, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ ++ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ AV_PIX_FMT_NV12, \ ++ AV_PIX_FMT_YUV420P, \ ++ AV_PIX_FMT_NONE}, \ ++ .hw_configs = v4l2_m2m_hw_configs, \ + .wrapper_name = "v4l2m2m", \ + } + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index f644b50133..6472b56030 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++ + #include "encode.h" + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" +@@ -38,6 +40,34 @@ + #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x + #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x + ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... 
++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in videodev2.h hopefully will be sometime in the future but until then... ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) + { + struct v4l2_streamparm parm = { 0 }; +@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) + static int v4l2_check_b_frame_support(V4L2m2mContext *s) + { + if (s->avctx->max_b_frames) +- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); ++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); + +- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); ++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); + v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); + if (s->avctx->max_b_frames == 0) + return 0; + + avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); +- + return AVERROR_PATCHWELCOME; + } + +@@ -271,17 +300,208 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) + return 0; + } + ++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *const src = (const 
AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? ++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ // A zero pitch is rejected too: bpl is used as a divisor for the linear formats below ++ if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } 
++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; ++ } ++ ++ return 0; ++} ++ ++// Do we have similar enough formats to be usable? ++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) ++{ ++ if (a->type != b->type) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { ++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; ++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; ++ unsigned int i; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->num_planes != pb->num_planes) ++ return 0; ++ for (i = 0; i != pa->num_planes; ++i) { ++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) ++ return 0; ++ } ++ } ++ else { ++ const struct v4l2_pix_format *const pa = &a->fmt.pix; ++ const struct v4l2_pix_format *const pb = &b->fmt.pix; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->bytesperline != pb->bytesperline) ++ return 0; ++ } ++ return 1; ++} ++ ++static inline int q_full(const V4L2Context *const output) ++{ ++ return ff_v4l2_context_q_count(output) == output->num_buffers; ++} ++ + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; ++ int rv; ++ const int needs_slot = q_full(output); ++ ++ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); ++ ++ // Signal EOF if needed (doesn't need q slot) ++ if (!frame) { ++ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__); ++ return ff_v4l2_context_enqueue_frame(output, frame); ++ } ++ ++ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) { ++ // We should be able to return AVERROR(EAGAIN) to indicate buffer ++ // exhaustion, but ffmpeg currently treats that as fatal. 
++ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv)); ++ return rv; ++ } ++ ++ if (s->input_drm && !output->streamon) { ++ struct v4l2_format req_format = {.type = output->format.type}; ++ ++ // Set format when we first get a buffer ++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); ++ return rv; ++ } ++ ++ ff_v4l2_context_release(output); ++ ++ output->format = req_format; ++ ++ if ((rv = ff_v4l2_context_set_format(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); ++ return rv; ++ } ++ ++ if (!fmt_eq(&req_format, &output->format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ output->selection.top = frame->crop_top; ++ output->selection.left = frame->crop_left; ++ output->selection.width = av_frame_cropped_width(frame); ++ output->selection.height = av_frame_cropped_height(frame); ++ ++ if ((rv = ff_v4l2_context_init(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); ++ return rv; ++ } ++ ++ { ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, ++ .target = V4L2_SEL_TGT_CROP, ++ .r = output->selection ++ }; ++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top, ++ av_err2str(AVERROR(errno))); ++ } ++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top); ++ } ++ } + + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME +- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) ++ if (frame->pict_type == AV_PICTURE_TYPE_I) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +- return ff_v4l2_context_enqueue_frame(output, 
frame); ++ rv = ff_v4l2_context_enqueue_frame(output, frame); ++ if (rv) { ++ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv)); ++ } ++ ++ return rv; + } + + static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) +@@ -292,6 +512,11 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + AVFrame *frame = s->frame; + int ret; + ++ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__, ++ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture)); ++ ++ ff_v4l2_dq_all(output, 0); ++ + if (s->draining) + goto dequeue; + +@@ -328,7 +553,115 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- return ff_v4l2_context_dequeue_packet(capture, avpkt); ++ // Dequeue a frame ++ for (;;) { ++ int t = q_full(output) ? -1 : s->draining ? 300 : 0; ++ int rv2; ++ ++ // If output is full wait for either a packet or output to become not full ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t); ++ ++ // If output was full retry packet dequeue ++ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300; ++ rv2 = ff_v4l2_dq_all(output, t); ++ if (t == 0 || rv2 != 0) ++ break; ++ } ++ if (ret) ++ return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; ++ ++ if (capture->first_buf == 1) { ++ uint8_t * data; ++ const int len = avpkt->size; ++ ++ // 1st buffer after streamon should be SPS/PPS ++ capture->first_buf = 2; ++ ++ // Clear both possible stores so there is no chance of confusion ++ av_freep(&s->extdata_data); ++ s->extdata_size = 0; ++ av_freep(&avctx->extradata); ++ avctx->extradata_size = 0; ++ ++ if ((data = av_mallocz(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) // zeroed so the padding after the header is defined ++ goto fail_no_mem; ++ ++ memcpy(data, avpkt->data, len); ++ av_packet_unref(avpkt); ++ ++ // We need to copy the header, but keep local if not global ++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { ++ avctx->extradata = data; ++ avctx->extradata_size = len; ++ } ++ else { ++ s->extdata_data = data; ++ s->extdata_size = len; ++ } ++ ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0); ++ ff_v4l2_dq_all(output, 0); ++ if (ret) ++ return ret; ++ } ++ ++ // First frame must be key so mark as such even if encoder forgot ++ if (capture->first_buf == 2) { ++ avpkt->flags |= AV_PKT_FLAG_KEY; ++ ++ // Add any extradata to the 1st packet we emit as we cannot create it at init ++ if (avctx->extradata_size > 0 && avctx->extradata) { ++ void * const side = av_packet_new_side_data(avpkt, ++ AV_PKT_DATA_NEW_EXTRADATA, ++ avctx->extradata_size); ++ if (!side) ++ goto fail_no_mem; ++ ++ memcpy(side, avctx->extradata, avctx->extradata_size); ++ } ++ } ++ ++ // Add SPS/PPS to the start of every key frame if non-global headers ++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { ++ const size_t newlen = s->extdata_size + avpkt->size; ++ AVBufferRef * const buf = av_buffer_allocz(newlen + AV_INPUT_BUFFER_PADDING_SIZE); // zeroed so the packet padding is defined ++ ++ if (buf == NULL) ++ goto fail_no_mem; ++ ++ memcpy(buf->data, s->extdata_data, s->extdata_size); ++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); ++ ++ av_buffer_unref(&avpkt->buf); ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ avpkt->size = newlen; ++ } ++ else 
if (ff_v4l2_context_q_count(capture) < 2) { ++ // Avoid running out of capture buffers ++ // In most cases the buffers will be returned quickly in which case ++ // we don't copy and can use the v4l2 buffers directly but sometimes ++ // ffmpeg seems to hold onto all of them for a long time (.mkv ++ // creation?) so avoid deadlock in those cases. ++ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); ++ if (buf == NULL) ++ goto fail_no_mem; ++ ++ memcpy(buf->data, avpkt->data, avpkt->size); ++ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer ++ ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ } ++ ++ capture->first_buf = 0; ++ return 0; ++ ++fail_no_mem: ++ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n"); ++ ret = AVERROR(ENOMEM); ++ av_packet_unref(avpkt); ++ return ret; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) +@@ -340,6 +673,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + uint32_t v4l2_fmt_output; + int ret; + ++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -347,13 +682,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + capture = &s->capture; + output = &s->output; + ++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); ++ + /* common settings output/capture */ + output->height = capture->height = avctx->height; + output->width = capture->width = avctx->width; + + /* output context */ + output->av_codec_id = AV_CODEC_ID_RAWVIDEO; +- output->av_pix_fmt = avctx->pix_fmt; ++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : ++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? 
avctx->sw_pix_fmt : ++ AV_PIX_FMT_YUV420P; + + /* capture context */ + capture->av_codec_id = avctx->codec_id; +@@ -372,7 +711,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + v4l2_fmt_output = output->format.fmt.pix.pixelformat; + + pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); +- if (pix_fmt_output != avctx->pix_fmt) { ++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); + av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); + return AVERROR(EINVAL); +@@ -390,9 +729,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx) + #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM + + #define V4L_M2M_CAPTURE_OPTS \ +- V4L_M2M_DEFAULT_OPTS,\ ++ { "num_output_buffers", "Number of buffers in the output context",\ ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\ + { "num_capture_buffers", "Number of buffers in the capture context", \ +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS } ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS } + + static const AVOption mpeg4_options[] = { + V4L_M2M_CAPTURE_OPTS, +diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c +new file mode 100644 +index 0000000000..5b3fb958fa +--- /dev/null ++++ b/libavcodec/v4l2_req_decode_q.c +@@ -0,0 +1,84 @@ ++#include ++#include ++#include ++ ++#include "v4l2_req_decode_q.h" ++ ++int decode_q_in_q(const req_decode_ent * const d) ++{ ++ return d->in_q; ++} ++ ++void decode_q_add(req_decode_q * const q, req_decode_ent * const d) ++{ ++ pthread_mutex_lock(&q->q_lock); ++ if (!q->head) { ++ q->head = d; ++ q->tail = d; ++ d->prev = NULL; ++ } ++ else { ++ q->tail->next = d; ++ d->prev = q->tail; ++ q->tail = d; ++ } ++ d->next = NULL; ++ d->in_q = 1; ++ pthread_mutex_unlock(&q->q_lock); ++} ++ ++// Remove 
entry from Q - if head wake-up anything that was waiting ++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) ++{ ++ int try_signal = 0; ++ ++ if (!d->in_q) ++ return; ++ ++ pthread_mutex_lock(&q->q_lock); ++ if (d->prev) ++ d->prev->next = d->next; ++ else { ++ try_signal = 1; // Only need to signal if we were head ++ q->head = d->next; ++ } ++ ++ if (d->next) ++ d->next->prev = d->prev; ++ else ++ q->tail = d->prev; ++ ++ // Not strictly needed but makes debug easier ++ d->next = NULL; ++ d->prev = NULL; ++ d->in_q = 0; ++ pthread_mutex_unlock(&q->q_lock); ++ ++ if (try_signal) ++ pthread_cond_broadcast(&q->q_cond); ++} ++ ++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) ++{ ++ pthread_mutex_lock(&q->q_lock); ++ ++ while (q->head != d) ++ pthread_cond_wait(&q->q_cond, &q->q_lock); ++ ++ pthread_mutex_unlock(&q->q_lock); ++} ++ ++void decode_q_uninit(req_decode_q * const q) ++{ ++ pthread_mutex_destroy(&q->q_lock); ++ pthread_cond_destroy(&q->q_cond); ++} ++ ++void decode_q_init(req_decode_q * const q) ++{ ++ memset(q, 0, sizeof(*q)); ++ pthread_mutex_init(&q->q_lock, NULL); ++ pthread_cond_init(&q->q_cond, NULL); ++} ++ ++ +diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h +new file mode 100644 +index 0000000000..af7bbe1de4 +--- /dev/null ++++ b/libavcodec/v4l2_req_decode_q.h +@@ -0,0 +1,25 @@ ++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H ++#define AVCODEC_V4L2_REQ_DECODE_Q_H ++ ++typedef struct req_decode_ent { ++ struct req_decode_ent * next; ++ struct req_decode_ent * prev; ++ int in_q; ++} req_decode_ent; ++ ++typedef struct req_decode_q { ++ pthread_mutex_t q_lock; ++ pthread_cond_t q_cond; ++ req_decode_ent * head; ++ req_decode_ent * tail; ++} req_decode_q; ++ ++int decode_q_in_q(const req_decode_ent * const d); ++void decode_q_add(req_decode_q * const q, req_decode_ent * const d); ++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); ++void decode_q_wait(req_decode_q * const 
q, req_decode_ent * const d); ++void decode_q_uninit(req_decode_q * const q); ++void decode_q_init(req_decode_q * const q); ++ ++#endif ++ +diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c +new file mode 100644 +index 0000000000..ee8527ba1f +--- /dev/null ++++ b/libavcodec/v4l2_req_devscan.c +@@ -0,0 +1,451 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_utils.h" ++ ++struct decdev { ++ enum v4l2_buf_type src_type; ++ uint32_t src_fmt_v4l2; ++ const char * vname; ++ const char * mname; ++}; ++ ++struct devscan { ++ struct decdev env; ++ unsigned int dev_size; ++ unsigned int dev_count; ++ struct decdev *devs; ++}; ++ ++static int video_src_pixfmt_supported(uint32_t fmt) ++{ ++ return 1; ++} ++ ++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, ++ unsigned int width, unsigned int height, ++ unsigned int pixelformat) ++{ ++ unsigned int sizeimage; ++ ++ memset(format, 0, sizeof(*format)); ++ format->type = type; ++ ++ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ format->fmt.pix_mp.width = width; ++ format->fmt.pix_mp.height = height; ++ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; ++ format->fmt.pix_mp.pixelformat = pixelformat; ++ } else { ++ format->fmt.pix.width = width; ++ format->fmt.pix.height = height; ++ format->fmt.pix.sizeimage = sizeimage; ++ format->fmt.pix.pixelformat = pixelformat; ++ } ++} ++ ++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, ++ unsigned int width, unsigned int height) ++{ ++ struct v4l2_format format; ++ ++ v4l2_setup_format(&format, type, width, height, pixelformat); ++ ++ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? 
-errno : 0; ++} ++ ++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) ++{ ++ struct v4l2_capability capability = { 0 }; ++ int rc; ++ ++ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); ++ if (rc < 0) ++ return -errno; ++ ++ if (capabilities != NULL) { ++ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) ++ *capabilities = capability.device_caps; ++ else ++ *capabilities = capability.capabilities; ++ } ++ ++ return 0; ++} ++ ++static int devscan_add(struct devscan *const scan, ++ enum v4l2_buf_type src_type, ++ uint32_t src_fmt_v4l2, ++ const char * vname, ++ const char * mname) ++{ ++ struct decdev *d; ++ ++ if (scan->dev_size <= scan->dev_count) { ++ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; ++ d = realloc(scan->devs, n * sizeof(*d)); ++ if (!d) ++ return -ENOMEM; ++ scan->devs = d; ++ scan->dev_size = n; ++ } ++ ++ d = scan->devs + scan->dev_count; ++ d->src_type = src_type; ++ d->src_fmt_v4l2 = src_fmt_v4l2; ++ d->vname = strdup(vname); ++ if (!d->vname) ++ return -ENOMEM; ++ d->mname = strdup(mname); ++ if (!d->mname) { ++ free((char *)d->vname); ++ return -ENOMEM; ++ } ++ ++scan->dev_count; ++ return 0; ++} ++ ++void devscan_delete(struct devscan **const pScan) ++{ ++ unsigned int i; ++ struct devscan * const scan = *pScan; ++ ++ if (!scan) ++ return; ++ *pScan = NULL; ++ ++ for (i = 0; i < scan->dev_count; ++i) { ++ free((char*)scan->devs[i].mname); ++ free((char*)scan->devs[i].vname); ++ } ++ free(scan->devs); ++ free(scan); ++} ++ ++#define REQ_BUF_CAPS (\ ++ V4L2_BUF_CAP_SUPPORTS_DMABUF |\ ++ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ ++ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) ++ ++static void probe_formats(void * const dc, ++ struct devscan *const scan, ++ const int fd, ++ const unsigned int type_v4l2, ++ const char *const mpath, ++ const char *const vpath) ++{ ++ unsigned int i; ++ for (i = 0;; ++i) { ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = i, ++ .type = type_v4l2 ++ }; ++ struct 
v4l2_requestbuffers rbufs = {
++            .count = 0,  /* count 0: only used to read back the buffer capabilities */
++            .type = type_v4l2,
++            .memory = V4L2_MEMORY_MMAP
++        };
++        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++            if (errno == EINTR)
++                continue;
++            if (errno != EINVAL)  /* EINVAL is treated as the normal end of enumeration */
++                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
++            return;
++        }
++        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
++            continue;
++
++        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
++            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
++            continue;
++        }
++
++        while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {  /* retry only while interrupted */
++            if (errno != EINTR) {
++                request_debug(dc, "%s: Reqbufs failed\n", vpath);
++                rbufs.capabilities = 0;  /* hard failure: make the caps test below skip this format */
++                break;  /* was 'continue', which re-ran the failing ioctl forever */
++            }
++        }
++        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
++            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
++            continue;
++        }
++
++        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
++                      mpath, vpath, fmtdesc.pixelformat, type_v4l2);
++        devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);  /* NOTE(review): error return is ignored */
++    }
++}
++
++
++static int probe_video_device(void * const dc,
++                              struct udev_device *const device,
++                              struct devscan *const scan,
++                              const char *const mpath)
++{   /* Open the video node of 'device', check its caps and record usable source formats in 'scan' */
++    int ret;
++    unsigned int capabilities = 0;
++    int video_fd = -1;
++
++    const char *path = udev_device_get_devnode(device);
++    if (!path) {
++        request_err(dc, "%s: get video device devnode failed\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    video_fd = open(path, O_RDWR, 0);
++    if (video_fd == -1) {
++        ret = -errno;
++        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
++        goto fail;
++    }
++
++    ret = v4l2_query_capabilities(video_fd, &capabilities);
++    if (ret < 0) {
++        request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
++
++    if (!(capabilities & V4L2_CAP_STREAMING))
{
++        request_debug(dc, "%s: missing required streaming capability\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
++        request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    /* Should check capture formats too... */
++    if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
++        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
++    if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
++        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
++
++    close(video_fd);
++    return 0;
++
++fail:
++    if (video_fd >= 0)
++        close(video_fd);
++    return ret;
++}
++
++static int probe_media_device(void * const dc,
++                              struct udev_device *const device,
++                              struct devscan *const scan)
++{   /* Open the media node of 'device' and query its info and topology; returns 0 or -errno */
++    int ret = 0;  /* was uninitialised: returned as-is when no V4L video interface is probed */
++    int rv;
++    struct media_device_info device_info = { 0 };
++    struct media_v2_topology topology = { 0 };
++    struct media_v2_interface *interfaces = NULL;
++    struct udev *udev = udev_device_get_udev(device);
++    struct udev_device *video_device;
++    dev_t devnum;
++    int media_fd = -1;
++
++    const char *path = udev_device_get_devnode(device);
++    if (!path) {
++        request_err(dc, "%s: get media device devnode failed\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    media_fd = open(path, O_RDWR, 0);
++    if (media_fd < 0) {
++        ret = -errno;
++        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);  /* first call: only the counts are used */
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    if (topology.num_interfaces <= 0)
{ ++ request_err(dc, "%s: media device has no interfaces\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); ++ if (!interfaces) { ++ request_err(dc, "%s: allocating media interface struct failed\n", __func__); ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; ++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (rv < 0) { ++ ret = -errno; ++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ for (int i = 0; i < topology.num_interfaces; i++) { ++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) ++ continue; ++ ++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); ++ video_device = udev_device_new_from_devnum(udev, 'c', devnum); ++ if (!video_device) { ++ ret = -errno; ++ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); ++ continue; ++ } ++ ++ ret = probe_video_device(dc, video_device, scan, path); ++ udev_device_unref(video_device); ++ ++ if (ret != 0) ++ goto fail; ++ } ++ ++fail: ++ free(interfaces); ++ if (media_fd != -1) ++ close(media_fd); ++ return ret; ++} ++ ++const char *decdev_media_path(const struct decdev *const dev) ++{ ++ return !dev ? NULL : dev->mname; ++} ++ ++const char *decdev_video_path(const struct decdev *const dev) ++{ ++ return !dev ? NULL : dev->vname; ++} ++ ++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) ++{ ++ return !dev ? 0 : dev->src_type; ++} ++ ++uint32_t decdev_src_pixelformat(const struct decdev *const dev) ++{ ++ return !dev ? 0 : dev->src_fmt_v4l2; ++} ++ ++ ++const struct decdev *devscan_find(struct devscan *const scan, ++ const uint32_t src_fmt_v4l2) ++{ ++ unsigned int i; ++ ++ if (scan->env.mname && scan->env.vname) ++ return &scan->env; ++ ++ if (!src_fmt_v4l2) ++ return scan->dev_count ? 
scan->devs + 0 : NULL;
++
++    for (i = 0; i != scan->dev_count; ++i) {  /* first entry with a matching source pixel format wins */
++        if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
++            return scan->devs + i;
++    }
++    return NULL;
++}
++
++int devscan_build(void * const dc, struct devscan **pscan)
++{   /* Build a device list in *pscan (NULL on failure); returns 0 or a negative errno */
++    int ret;
++    struct udev *udev = NULL;  /* must be NULL: the fail path tests it before udev_new() has run */
++    struct udev_enumerate *enumerate;
++    struct udev_list_entry *devices;
++    struct udev_list_entry *entry;
++    struct udev_device *device;
++    struct devscan * scan;
++
++    *pscan = NULL;
++
++    scan = calloc(1, sizeof(*scan));
++    if (!scan) {
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
++    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
++    if (scan->env.mname && scan->env.vname) {  /* both overrides set: skip the udev scan entirely */
++        request_info(dc, "Media/video device env overrides found: %s,%s\n",
++                     scan->env.mname, scan->env.vname);
++        *pscan = scan;
++        return 0;
++    }
++
++    udev = udev_new();
++    if (!udev) {
++        request_err(dc, "%s: allocating udev context failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    enumerate = udev_enumerate_new(udev);
++    if (!enumerate) {
++        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    udev_enumerate_add_match_subsystem(enumerate, "media");
++    udev_enumerate_scan_devices(enumerate);
++
++    devices = udev_enumerate_get_list_entry(enumerate);
++    udev_list_entry_foreach(entry, devices) {
++        const char *path = udev_list_entry_get_name(entry);
++        if (!path)
++            continue;
++
++        device = udev_device_new_from_syspath(udev, path);
++        if (!device)
++            continue;
++
++        probe_media_device(dc, device, scan);  /* result ignored: a failing device is simply not added */
++        udev_device_unref(device);
++    }
++
++    udev_enumerate_unref(enumerate);
++    udev_unref(udev);
++
++    *pscan = scan;
++    return 0;
++
++fail:
++    if (udev)
++        udev_unref(udev);
++    devscan_delete(&scan);
++    return ret;
++}
++
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+new file mode 100644
+index 0000000000..956d9234f1
+--- /dev/null
++++
b/libavcodec/v4l2_req_devscan.h
+@@ -0,0 +1,23 @@
++#ifndef _DEVSCAN_H_
++#define _DEVSCAN_H_
++
++#include
++
++struct devscan;
++struct decdev;
++enum v4l2_buf_type;
++
++/* These return pointers to data in the devscan structure and so are valid
++ * for the lifetime of that structure. A NULL dev yields NULL (paths) or 0.
++ */
++const char *decdev_media_path(const struct decdev *const dev);
++const char *decdev_video_path(const struct decdev *const dev);
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
++uint32_t decdev_src_pixelformat(const struct decdev *const dev);
++
++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++
++int devscan_build(void * const dc, struct devscan **pscan);
++void devscan_delete(struct devscan **const pScan);
++
++#endif
+diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
+new file mode 100644
+index 0000000000..acc0366e76
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.c
+@@ -0,0 +1,369 @@
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_utils.h"
++
++#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
++#define DMABUF_NAME2 "/dev/dma_heap/reserved"
++
++#define TRACE_ALLOC 0
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabuf_fns {  /* allocator backend operations */
++    int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);
++    void (*buf_free)(struct dmabuf_h * dh);
++    int (*ctl_new)(struct dmabufs_ctl * dbsc);
++    void (*ctl_free)(struct dmabufs_ctl * dbsc);
++};
++
++struct dmabufs_ctl {
++    atomic_int ref_count;  /* zero-based: unref frees when the value before decrement is 0 */
++    int fd;  /* set to -1 at creation; the CMA backend stores its dma-heap device fd here */
++    size_t page_size;  /* from sysconf(_SC_PAGE_SIZE) */
++    void * v;
++    const struct dmabuf_fns * fns;
++};
++
++struct dmabuf_h {
++    int fd;  /* -1 when the handle has no fd */
++    size_t size;  /* allocated size */
++    size_t len;  /* bytes in use */
++    void * mapptr;  /* MAP_FAILED while not mapped */
++    void * v;
++    const struct dmabuf_fns * fns;  /* left NULL by dmabuf_import() and dmabuf_import_mmap() */
++};
++
++#if TRACE_ALLOC
++static unsigned int total_bufs = 0;
++static size_t total_size = 0;
++#endif
++
++struct dmabuf_h *
dmabuf_import_mmap(void * mapptr, size_t size)
++{   /* Wrap an existing mapping in a handle with no fd; NULL if mapptr is MAP_FAILED or on OOM */
++    struct dmabuf_h *dh;
++
++    if (mapptr == MAP_FAILED)
++        return NULL;
++
++    dh = malloc(sizeof(*dh));
++    if (!dh)
++        return NULL;
++
++    *dh = (struct dmabuf_h) {
++        .fd = -1,
++        .size = size,
++        .mapptr = mapptr
++    };
++
++    return dh;
++}
++
++struct dmabuf_h * dmabuf_import(int fd, size_t size)
++{   /* Wrap a dup() of fd in a new handle; NULL if size is 0, dup fails or on OOM */
++    struct dmabuf_h *dh;
++
++    fd = size == 0 ? -1 : dup(fd);  /* reject size 0 before dup() so no duplicate fd is leaked */
++    if (fd < 0)
++        return NULL;
++
++    dh = malloc(sizeof(*dh));
++    if (!dh) {
++        close(fd);
++        return NULL;
++    }
++
++    *dh = (struct dmabuf_h) {
++        .fd = fd,
++        .size = size,
++        .mapptr = MAP_FAILED
++    };
++
++#if TRACE_ALLOC
++    ++total_bufs;
++    total_size += dh->size;
++    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++    return dh;
++}
++
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
++{   /* Return old if it is already big enough, else free it and allocate a new buffer */
++    struct dmabuf_h * dh;
++    if (old != NULL) {
++        if (old->size >= size) {
++            return old;
++        }
++        dmabuf_free(old);  /* old is gone from here on, even if the new allocation fails */
++    }
++
++    if (size == 0 ||
++        (dh = malloc(sizeof(*dh))) == NULL)
++        return NULL;
++
++    *dh = (struct dmabuf_h){
++        .fd = -1,
++        .mapptr = MAP_FAILED,
++        .fns = dbsc->fns
++    };
++
++    if (dh->fns->buf_alloc(dbsc, dh, size) != 0)
++        goto fail;
++
++
++#if TRACE_ALLOC
++    ++total_bufs;
++    total_size += dh->size;
++    request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++    return dh;
++
++fail:
++    free(dh);
++    return NULL;
++}
++
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
++{   /* Issue DMA_BUF_IOCTL_SYNC with flags; no-op for handles without an fd; returns 0 or -errno */
++    struct dma_buf_sync sync = {
++        .flags = flags
++    };
++    if (dh->fd == -1)
++        return 0;
++    while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
++        const int err = errno;  /* saved before logging can change errno */
++        if (errno == EINTR)
++            continue;
++        request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
++        return -err;
++    }
++    return 0;
++}
++
++int dmabuf_write_start(struct dmabuf_h * const dh)
++{
++    return dmabuf_sync(dh,
DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_write_end(struct dmabuf_h * const dh)
++{
++    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_read_start(struct dmabuf_h * const dh)
++{   /* Ensures the buffer is mapped, then starts a read sync; -1 if mapping fails */
++    if (!dmabuf_map(dh))
++        return -1;
++    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
++}
++
++int dmabuf_read_end(struct dmabuf_h * const dh)
++{
++    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
++}
++
++
++void * dmabuf_map(struct dmabuf_h * const dh)
++{   /* Map on first use and cache the pointer in dh->mapptr; NULL on failure or NULL dh */
++    if (!dh)
++        return NULL;
++    if (dh->mapptr != MAP_FAILED)
++        return dh->mapptr;
++    dh->mapptr = mmap(NULL, dh->size,
++                      PROT_READ | PROT_WRITE,
++                      MAP_SHARED | MAP_POPULATE,
++                      dh->fd, 0);
++    if (dh->mapptr == MAP_FAILED) {
++        request_log("%s: Map failed\n", __func__);
++        return NULL;
++    }
++    return dh->mapptr;
++}
++
++int dmabuf_fd(const struct dmabuf_h * const dh)
++{
++    if (!dh)
++        return -1;
++    return dh->fd;
++}
++
++size_t dmabuf_size(const struct dmabuf_h * const dh)
++{
++    if (!dh)
++        return 0;
++    return dh->size;
++}
++
++size_t dmabuf_len(const struct dmabuf_h * const dh)
++{
++    if (!dh)
++        return 0;
++    return dh->len;
++}
++
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
++{   /* unlike the getters above, dh must not be NULL here */
++    dh->len = len;
++}
++
++void dmabuf_free(struct dmabuf_h * dh)
++{   /* Release backend state, mapping, fd and the handle itself; NULL dh is a no-op */
++    if (!dh)
++        return;
++
++#if TRACE_ALLOC
++    --total_bufs;
++    total_size -= dh->size;
++    request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++    if (dh->fns)  /* handles from dmabuf_import()/dmabuf_import_mmap() carry no fns */
++        dh->fns->buf_free(dh);
++    if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
++        munmap(dh->mapptr, dh->size);
++    if (dh->fd != -1)
++        while (close(dh->fd) == -1 && errno == EINTR)
++            /* loop */;
++    free(dh);
++}
++
++static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
++{
++    struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc));
++
++    if (!dbsc)
++        return NULL;
++
++    dbsc->fd = -1;
++    dbsc->fns = fns;
++    dbsc->page_size =
(size_t)sysconf(_SC_PAGE_SIZE); ++ ++ if (fns->ctl_new(dbsc) != 0) ++ goto fail; ++ ++ return dbsc; ++ ++fail: ++ free(dbsc); ++ return NULL; ++} ++ ++static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) ++{ ++ request_debug(NULL, "Free dmabuf ctl\n"); ++ ++ dbsc->fns->ctl_free(dbsc); ++ ++ free(dbsc); ++} ++ ++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc) ++{ ++ struct dmabufs_ctl * const dbsc = *pDbsc; ++ ++ if (!dbsc) ++ return; ++ *pDbsc = NULL; ++ ++ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0) ++ return; ++ ++ dmabufs_ctl_free(dbsc); ++} ++ ++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) ++{ ++ atomic_fetch_add(&dbsc->ref_count, 1); ++ return dbsc; ++} ++ ++//----------------------------------------------------------------------------- ++// ++// Alloc dmabuf via CMA ++ ++static int ctl_cma_new(struct dmabufs_ctl * dbsc) ++{ ++ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && ++ errno == EINTR) ++ /* Loop */; ++ ++ if (dbsc->fd == -1) { ++ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && ++ errno == EINTR) ++ /* Loop */; ++ if (dbsc->fd == -1) { ++ request_log("Unable to open either %s or %s\n", ++ DMABUF_NAME1, DMABUF_NAME2); ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void ctl_cma_free(struct dmabufs_ctl * dbsc) ++{ ++ if (dbsc->fd != -1) ++ while (close(dbsc->fd) == -1 && errno == EINTR) ++ /* loop */; ++ ++} ++ ++static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) ++{ ++ struct dma_heap_allocation_data data = { ++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), ++ .fd = 0, ++ .fd_flags = O_RDWR, ++ .heap_flags = 0 ++ }; ++ ++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { ++ int err = errno; ++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", ++ (uint64_t)data.len, ++ dbsc->fd, ++ err, ++ strerror(err)); ++ if (err == EINTR) ++ continue; ++ return -err; ++ } ++ ++ dh->fd = data.fd; ++ dh->size = 
(size_t)data.len; ++ return 0; ++} ++ ++static void buf_cma_free(struct dmabuf_h * dh) ++{ ++ // Nothing needed ++} ++ ++static const struct dmabuf_fns dmabuf_cma_fns = { ++ .buf_alloc = buf_cma_alloc, ++ .buf_free = buf_cma_free, ++ .ctl_new = ctl_cma_new, ++ .ctl_free = ctl_cma_free, ++}; ++ ++struct dmabufs_ctl * dmabufs_ctl_new(void) ++{ ++ request_debug(NULL, "Dmabufs using CMA\n");; ++ return dmabufs_ctl_new2(&dmabuf_cma_fns); ++} ++ +diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h +new file mode 100644 +index 0000000000..381ba2708d +--- /dev/null ++++ b/libavcodec/v4l2_req_dmabufs.h +@@ -0,0 +1,44 @@ ++#ifndef DMABUFS_H ++#define DMABUFS_H ++ ++#include ++ ++struct dmabufs_ctl; ++struct dmabuf_h; ++ ++struct dmabufs_ctl * dmabufs_ctl_new(void); ++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc); ++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc); ++ ++// Need not preserve old contents ++// On NULL return old buffer is freed ++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); ++ ++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { ++ return dmabuf_realloc(dbsc, NULL, size); ++} ++/* Create from existing fd - dups(fd) */ ++struct dmabuf_h * dmabuf_import(int fd, size_t size); ++/* Import an MMAP - return NULL if mapptr = MAP_FAIL */ ++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size); ++ ++void * dmabuf_map(struct dmabuf_h * const dh); ++ ++/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ ++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); ++ ++int dmabuf_write_start(struct dmabuf_h * const dh); ++int dmabuf_write_end(struct dmabuf_h * const dh); ++int dmabuf_read_start(struct dmabuf_h * const dh); ++int dmabuf_read_end(struct dmabuf_h * const dh); ++ ++int dmabuf_fd(const struct dmabuf_h * const dh); ++/* Allocated size */ ++size_t dmabuf_size(const struct dmabuf_h * const dh); ++/* Bytes in use 
*/ ++size_t dmabuf_len(const struct dmabuf_h * const dh); ++/* Set bytes in use */ ++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); ++void dmabuf_free(struct dmabuf_h * dh); ++ ++#endif +diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c +new file mode 100644 +index 0000000000..169b532832 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v1.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 1 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c +new file mode 100644 +index 0000000000..42af98e156 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v2.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 2 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c +new file mode 100644 +index 0000000000..dcc8d95632 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v3.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 3 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c +new file mode 100644 +index 0000000000..c35579d8e0 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v4.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 4 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +new file mode 100644 +index 0000000000..b98d8464ca +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -0,0 +1,1360 @@ ++// File included by v4l2_req_hevc_v* - not compiled on its own ++ ++#include "decode.h" ++#include "hevcdec.h" ++#include "hwconfig.h" ++ ++#if HEVC_CTRLS_VERSION == 1 ++#include "hevc-ctrls-v1.h" ++ ++// Fixup renamed entries ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT ++ ++#elif HEVC_CTRLS_VERSION == 2 ++#include "hevc-ctrls-v2.h" ++#elif HEVC_CTRLS_VERSION == 3 ++#include "hevc-ctrls-v3.h" ++#elif HEVC_CTRLS_VERSION == 4 ++#include ++#if 
!defined(V4L2_CID_STATELESS_HEVC_SPS) ++#include "hevc-ctrls-v4.h" ++#endif ++#else ++#error Unknown HEVC_CTRLS_VERSION ++#endif ++ ++#ifndef V4L2_CID_STATELESS_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE ++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE ++ ++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED ++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED ++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE ++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B ++#endif ++ ++#include "v4l2_request_hevc.h" ++ ++#include "libavutil/hwcontext_drm.h" ++ ++#include ++#include ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_media.h" ++#include "v4l2_req_utils.h" ++ ++// Attached to buf[0] in frame ++// Pooled in hwcontext so generally create once - 1/frame ++typedef struct V4L2MediaReqDescriptor { ++ AVDRMFrameDescriptor drm; ++ ++ // Media ++ uint64_t timestamp; ++ struct qent_dst * qe_dst; ++ ++ // Decode only - should be NULL by the time we emit the frame ++ struct req_decode_ent decode_ent; ++ ++ struct media_request *req; ++ struct qent_src *qe_src; ++ ++#if HEVC_CTRLS_VERSION >= 2 ++ struct v4l2_ctrl_hevc_decode_params dec; ++#endif ++ ++ size_t num_slices; ++ size_t alloced_slices; ++ struct v4l2_ctrl_hevc_slice_params * slice_params; ++ struct slice_info * slices; ++ 
++ size_t num_offsets; ++ size_t alloced_offsets; ++ uint32_t *offsets; ++ ++} V4L2MediaReqDescriptor; ++ ++struct slice_info { ++ const uint8_t * ptr; ++ size_t len; // bytes ++ size_t n_offsets; ++}; ++ ++// Handy container for accumulating controls before setting ++struct req_controls { ++ int has_scaling; ++ struct timeval tv; ++ struct v4l2_ctrl_hevc_sps sps; ++ struct v4l2_ctrl_hevc_pps pps; ++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; ++}; ++ ++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++ ++// Get an FFmpeg format from the v4l2 format ++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) ++{ ++ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? ++ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { ++ case V4L2_PIX_FMT_YUV420: ++ return AV_PIX_FMT_YUV420P; ++ case V4L2_PIX_FMT_NV12: ++ return AV_PIX_FMT_NV12; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ return AV_PIX_FMT_RPI4_8; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ return AV_PIX_FMT_RPI4_10; ++#endif ++ default: ++ break; ++ } ++ return AV_PIX_FMT_NONE; ++} ++ ++static inline uint64_t frame_capture_dpb(const AVFrame * const frame) ++{ ++ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; ++ return rd->timestamp; ++} ++ ++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) ++{ ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; ++ rd->timestamp = dpb_stamp; ++} ++ ++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) ++{ ++ int32_t luma_weight_denom, chroma_weight_denom; ++ const SliceHeader *sh = &h->sh; ++ ++ if (sh->slice_type == HEVC_SLICE_I || ++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || ++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) ++ return; ++ ++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; ++ ++ if 
(h->ps.sps->chroma_format_idc) ++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; ++ ++ luma_weight_denom = (1 << sh->luma_log2_weight_denom); ++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { ++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; ++ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; ++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; ++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; ++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; ++ } ++ ++ if (sh->slice_type != HEVC_SLICE_B) ++ return; ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { ++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; ++ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; ++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; ++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; ++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; ++ } ++} ++ ++#if HEVC_CTRLS_VERSION <= 2 ++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) ++{ ++ const HEVCFrame *frame; ++ int i; ++ ++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { ++ frame = h->rps[ST_CURR_BEF].ref[i]; ++ if (frame && timestamp == frame_capture_dpb(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; ++ } ++ ++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { ++ frame = h->rps[ST_CURR_AFT].ref[i]; ++ if (frame && timestamp == frame_capture_dpb(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; ++ } ++ ++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { ++ frame = h->rps[LT_CURR].ref[i]; ++ if 
(frame && timestamp == frame_capture_dpb(frame->frame))
++            return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
++    }
++
++    return 0;
++}
++#endif
++
++static unsigned int
++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
++                  const struct v4l2_hevc_dpb_entry * const entries,
++                  const unsigned int num_entries)
++{   /* Index of the DPB entry whose timestamp matches frame; 0 if frame is NULL or not found */
++    uint64_t timestamp;
++
++    if (!frame)
++        return 0;
++
++    timestamp = frame_capture_dpb(frame->frame);
++
++    for (unsigned int i = 0; i < num_entries; i++) {
++        if (entries[i].timestamp == timestamp)
++            return i;
++    }
++
++    return 0;
++}
++
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{   /* Advance b by idx bytes, additionally stepping over a 0x03 byte that follows two or more zeros */
++    unsigned int z = 0;  /* length of the current run of zero bytes */
++    while (idx--) {
++        if (*b++ == 0) {
++            ++z;
++            if (z >= 2 && *b == 3) {
++                ++b;
++                z = 0;
++            }
++        }
++        else {
++            z = 0;
++        }
++    }
++    return b;
++}
++
++static int slice_add(V4L2MediaReqDescriptor * const rd)
++{   /* Reserve one more slot in slice_params[] and slices[], doubling capacity when full */
++    if (rd->num_slices >= rd->alloced_slices) {
++        struct v4l2_ctrl_hevc_slice_params * p2;
++        struct slice_info * s2;
++        size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
++
++        p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
++        if (p2 == NULL)
++            return AVERROR(ENOMEM);
++        rd->slice_params = p2;
++
++        s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
++        if (s2 == NULL)
++            return AVERROR(ENOMEM);
++        rd->slices = s2;
++
++        rd->alloced_slices = n2;  /* only updated once both arrays have grown */
++    }
++    ++rd->num_slices;
++    return 0;
++}
++
++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
++{   /* Append n values to rd->offsets[], each stored as offsets[i] - 1; returns 0 or AVERROR(ENOMEM) */
++    if (rd->num_offsets + n > rd->alloced_offsets) {
++        size_t n2 = rd->alloced_offsets == 0 ? 128 : rd->alloced_offsets * 2;  /* was sized from alloced_slices */
++        void * p2;
++        while (rd->num_offsets + n > n2)  /* keep doubling until the new entries fit */
++            n2 *= 2;
++        if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
++            return AVERROR(ENOMEM);
++        rd->offsets = p2;
++        rd->alloced_offsets = n2;
++    }
++    for (size_t i = 0; i != n; ++i)
++        rd->offsets[rd->num_offsets++] = offsets[i] - 1;
++    return 0;
++}
++
++static unsigned int
++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
++{   /* NOTE(review): n is not checked against the capacity of entries[] - confirm the caller's array is large enough */
++    unsigned int i;
++    unsigned int n = 0;
++    const HEVCFrame * const pic = h->ref;
++
++    for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
++        const HEVCFrame * const frame = &h->DPB[i];
++        if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
++            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
++
++            entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
++            entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++                          V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
++            entry->field_pic = frame->frame->interlaced_frame;
++
++#if HEVC_CTRLS_VERSION <= 3
++            /* TODO: Interleaved: Get the POC for each field.
*/ ++ entry->pic_order_cnt[0] = frame->poc; ++ entry->pic_order_cnt[1] = frame->poc; ++#else ++ entry->pic_order_cnt_val = frame->poc; ++#endif ++ } ++ } ++ return n; ++} ++ ++static void fill_slice_params(const HEVCContext * const h, ++#if HEVC_CTRLS_VERSION >= 2 ++ const struct v4l2_ctrl_hevc_decode_params * const dec, ++#endif ++ struct v4l2_ctrl_hevc_slice_params *slice_params, ++ uint32_t bit_size, uint32_t bit_offset) ++{ ++ const SliceHeader * const sh = &h->sh; ++#if HEVC_CTRLS_VERSION >= 2 ++ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; ++ const unsigned int dpb_n = dec->num_active_dpb_entries; ++#else ++ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; ++ unsigned int dpb_n; ++#endif ++ unsigned int i; ++ RefPicList *rpl; ++ ++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { ++ .bit_size = bit_size, ++#if HEVC_CTRLS_VERSION <= 3 ++ .data_bit_offset = bit_offset, ++#else ++ .data_byte_offset = bit_offset / 8 + 1, ++#endif ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_segment_addr = sh->slice_segment_addr, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ .nal_unit_type = h->nal_unit_type, ++ .nuh_temporal_id_plus1 = h->temporal_id + 1, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_type = sh->slice_type, ++ .colour_plane_id = sh->colour_plane_id, ++ .slice_pic_order_cnt = h->ref->poc, ++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, ++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, ++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 
0 : 5 - sh->max_num_merge_cand, ++ .slice_qp_delta = sh->slice_qp_delta, ++ .slice_cb_qp_offset = sh->slice_cb_qp_offset, ++ .slice_cr_qp_offset = sh->slice_cr_qp_offset, ++ .slice_act_y_qp_offset = 0, ++ .slice_act_cb_qp_offset = 0, ++ .slice_act_cr_qp_offset = 0, ++ .slice_beta_offset_div2 = sh->beta_offset / 2, ++ .slice_tc_offset_div2 = sh->tc_offset / 2, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ .pic_struct = h->sei.picture_timing.picture_struct, ++ ++#if HEVC_CTRLS_VERSION < 2 ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++#endif ++ }; ++ ++ if (sh->slice_sample_adaptive_offset_flag[0]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; ++ ++ if (sh->slice_sample_adaptive_offset_flag[1]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; ++ ++ if (sh->slice_temporal_mvp_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; ++ ++ if (sh->mvd_l1_zero_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; ++ ++ if (sh->cabac_init_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; ++ ++ if (sh->collocated_list == L0) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; ++ ++ if (sh->disable_deblocking_filter_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; ++ ++ if (sh->slice_loop_filter_across_slices_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (sh->dependent_slice_segment_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++#if HEVC_CTRLS_VERSION < 2 ++ dpb_n = fill_dpb_entries(h, dpb); ++ slice_params->num_active_dpb_entries = dpb_n; 
++#endif ++ ++ if (sh->slice_type != HEVC_SLICE_I) { ++ rpl = &h->ref->refPicList[0]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ rpl = &h->ref->refPicList[1]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); ++ } ++ ++ fill_pred_table(h, &slice_params->pred_weight_table); ++ ++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++#if HEVC_CTRLS_VERSION <= 3 ++ if (slice_params->num_entry_point_offsets > 256) { ++ slice_params->num_entry_point_offsets = 256; ++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); ++ } ++ ++ for (i = 0; i < slice_params->num_entry_point_offsets; i++) ++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++#endif ++} ++ ++#if HEVC_CTRLS_VERSION >= 2 ++static void ++fill_decode_params(const HEVCContext * const h, ++ struct v4l2_ctrl_hevc_decode_params * const dec) ++{ ++ unsigned int i; ++ ++ *dec = (struct v4l2_ctrl_hevc_decode_params){ ++ .pic_order_cnt_val = h->poc, ++ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++ }; ++ ++ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); ++ ++ // The docn does seem to ask that we fit our 32 bit signed POC into ++ // a U8 so... 
(To be fair 16 bits would be enough) ++ // Luckily we (Pi) don't use these fields ++ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) ++ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; ++ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) ++ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; ++ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) ++ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; ++ ++ if (IS_IRAP(h)) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; ++ if (IS_IDR(h)) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; ++ if (h->sh.no_output_of_prior_pics_flag) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; ++ ++} ++#endif ++ ++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) ++{ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_sps) { ++ .chroma_format_idc = sps->chroma_format_idc, ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, ++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, ++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .max_transform_hierarchy_depth_intra = 
sps->max_transform_hierarchy_depth_intra, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ .chroma_format_idc = sps->chroma_format_idc, ++ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, ++ }; ++ ++ if (sps->separate_colour_plane_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ ++ if (sps->scaling_list_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; ++ ++ if (sps->amp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; ++ ++ if (sps->sao_enabled) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; ++ ++ if (sps->pcm_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; ++ ++ if (sps->pcm.loop_filter_disable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; ++ ++ if (sps->long_term_ref_pics_present_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; ++ ++ if (sps->sps_strong_intra_smoothing_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; ++} ++ ++static void fill_scaling_matrix(const ScalingList * const sl, ++ struct v4l2_ctrl_hevc_scaling_matrix * const sm) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < 6; i++) { ++ unsigned int j; ++ ++ for (j = 0; j < 16; j++) ++ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; ++ for (j = 0; j < 64; j++) { ++ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; ++ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; ++ if (i < 2) ++ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; ++ } ++ 
sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; ++ if (i < 2) ++ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; ++ } ++} ++ ++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) ++{ ++ uint64_t flags = 0; ++ ++ if (pps->dependent_slice_segments_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; ++ ++ if (pps->output_flag_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; ++ ++ if (pps->sign_data_hiding_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; ++ ++ if (pps->cabac_init_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; ++ ++ if (pps->constrained_intra_pred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ ++ if (pps->transform_skip_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; ++ ++ if (pps->cu_qp_delta_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; ++ ++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; ++ ++ if (pps->weighted_pred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; ++ ++ if (pps->weighted_bipred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; ++ ++ if (pps->transquant_bypass_enable_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; ++ ++ if (pps->tiles_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; ++ ++ if (pps->loop_filter_across_tiles_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; ++ ++ if (pps->seq_loop_filter_across_slices_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (pps->deblocking_filter_override_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; ++ ++ if (pps->disable_dbf) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; ++ 
++ if (pps->lists_modification_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; ++ ++ if (pps->slice_header_extension_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_pps) { ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ .flags = flags ++ }; ++ ++ ++ if (pps->tiles_enabled_flag) { ++ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; ++ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; ++ ++ for (int i = 0; i < pps->num_tile_columns; i++) ++ ctrl->column_width_minus1[i] = pps->column_width[i] - 1; ++ ++ for (int i = 0; i < pps->num_tile_rows; i++) ++ ctrl->row_height_minus1[i] = pps->row_height[i] - 1; ++ } ++} ++ ++// Called before finally returning the frame to the user ++// Set corrupt flag here as this is actually the frame structure that ++// is going to the user (in MT land each thread has its own pool) ++static int frame_post_process(void *logctx, AVFrame *frame) ++{ ++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; ++ ++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); ++ frame->flags &= ~AV_FRAME_FLAG_CORRUPT; ++ if (rd->qe_dst) { ++ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); ++ if (stat != MEDIABUFS_STATUS_SUCCESS) { ++ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); ++ frame->flags |= AV_FRAME_FLAG_CORRUPT; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline struct timeval cvt_dpb_to_tv(uint64_t t) ++{ ++ t /= 1000; ++ return (struct timeval){ ++ .tv_usec = t % 1000000, 
++ .tv_sec = t / 1000000 ++ }; ++} ++ ++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) ++{ ++ return (uint64_t)t * 1000; ++} ++ ++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); ++ decode_q_add(&ctx->decode_q, &rd->decode_ent); ++ ++ rd->num_slices = 0; ++ ctx->timestamp++; ++ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); ++ ++ { ++ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; ++ fdd->post_process = frame_post_process; ++ } ++ ++ // qe_dst needs to be bound to the data buffer and only returned when that is ++ if (!rd->qe_dst) ++ { ++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ return 0; ++} ++ ++// Object fd & size will be zapped by this & need setting later ++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) ++{ ++ AVDRMLayerDescriptor *layer = &desc->layers[0]; ++ unsigned int width; ++ unsigned int height; ++ unsigned int bpl; ++ uint32_t pixelformat; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ width = format->fmt.pix_mp.width; ++ height = format->fmt.pix_mp.height; ++ pixelformat = format->fmt.pix_mp.pixelformat; ++ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; ++ } ++ else { ++ width = format->fmt.pix.width; ++ height = format->fmt.pix.height; ++ pixelformat = format->fmt.pix.pixelformat; ++ bpl = format->fmt.pix.bytesperline; ++ } ++ ++ switch 
(pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); ++ break; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ layer->format = DRM_FORMAT_P030; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); ++ break; ++#endif ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; ++ break; ++#endif ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ case V4L2_PIX_FMT_NV15: ++ layer->format = DRM_FORMAT_NV15; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ case V4L2_PIX_FMT_NV16: ++ layer->format = DRM_FORMAT_NV16; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ case V4L2_PIX_FMT_NV20: ++ layer->format = DRM_FORMAT_NV20; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ default: ++ return -1; ++ } ++ ++ desc->nb_objects = 1; ++ desc->objects[0].fd = -1; ++ desc->objects[0].size = 0; ++ ++ desc->nb_layers = 1; ++ layer->nb_planes = 2; ++ ++ layer->planes[0].object_index = 0; ++ layer->planes[0].offset = 0; ++ layer->planes[0].pitch = bpl; ++#if CONFIG_SAND ++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = width; ++ layer->planes[1].pitch = width; ++ } ++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy ++ 
layer->planes[1].pitch = width * 2; ++ } ++ else ++#endif ++ { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = layer->planes[0].pitch * height; ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ } ++ ++ return 0; ++} ++ ++static int ++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, ++ struct req_controls *const controls, ++#if HEVC_CTRLS_VERSION >= 2 ++ struct v4l2_ctrl_hevc_decode_params * const dec, ++#endif ++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, ++ void * const offsets, const size_t offset_count) ++{ ++ int rv; ++#if HEVC_CTRLS_VERSION >= 2 ++ unsigned int n = 3; ++#else ++ unsigned int n = 2; ++#endif ++ ++ struct v4l2_ext_control control[6] = { ++ { ++ .id = V4L2_CID_STATELESS_HEVC_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_STATELESS_HEVC_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++#if HEVC_CTRLS_VERSION >= 2 ++ { ++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, ++ .ptr = dec, ++ .size = sizeof(*dec), ++ }, ++#endif ++ }; ++ ++ if (slices) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, ++ .ptr = slices, ++ .size = sizeof(*slices) * slice_count, ++ }; ++ ++ if (controls->has_scaling) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }; ++ ++#if HEVC_CTRLS_VERSION >= 4 ++ if (offsets) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, ++ .ptr = offsets, ++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, ++ }; ++#endif ++ ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); ++ ++ return rv; ++} ++ ++// This only works because we started out from a single coded frame buffer ++// that will remain intact until after 
end_frame ++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; ++ int bcount = get_bits_count(&h->HEVClc->gb); ++ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; ++ ++ const unsigned int n = rd->num_slices; ++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; ++ ++ int rv; ++ struct slice_info * si; ++ ++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer ++ // that contains the entire frame including the start code ++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { ++ buffer -= 3; ++ size += 3; ++ boff += 24; ++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { ++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", ++ buffer[0], buffer[1], buffer[2]); ++ } ++ } ++ ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (rd->slices == NULL) { ++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices->ptr = buffer; ++ rd->num_slices = 1; ++ } ++ rd->slices->len = buffer - rd->slices->ptr + size; ++ return 0; ++ } ++ ++ if ((rv = slice_add(rd)) != 0) ++ return rv; ++ ++ si = rd->slices + n; ++ si->ptr = buffer; ++ si->len = size; ++ si->n_offsets = rd->num_offsets; ++ ++ if (n != block_start) { ++ struct slice_info *const si0 = rd->slices + block_start; ++ const size_t offset = (buffer - si0->ptr); ++ boff += offset * 8; ++ size += offset; ++ si0->len = si->len + offset; ++ } ++ ++#if HEVC_CTRLS_VERSION >= 2 ++ if (n == 0) ++ fill_decode_params(h, &rd->dec); ++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); ++#else ++ fill_slice_params(h, rd->slice_params + n, size 
* 8, boff); ++#endif ++ if (ctx->max_offsets != 0 && ++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) ++ return rv; ++ ++ return 0; ++} ++ ++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ if (h->ref != NULL) { ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++ media_request_abort(&rd->req); ++ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); ++ ++ decode_q_remove(&ctx->decode_q, &rd->decode_ent); ++ } ++} ++ ++static int send_slice(AVCodecContext * const avctx, ++ V4L2MediaReqDescriptor * const rd, ++ struct req_controls *const controls, ++ const unsigned int i, const unsigned int j) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++ const int is_last = (j == rd->num_slices); ++ struct slice_info *const si = rd->slices + i; ++ struct media_request * req = NULL; ++ struct qent_src * src = NULL; ++ MediaBufsStatus stat; ++ void * offsets = rd->offsets + rd->slices[i].n_offsets; ++ size_t n_offsets = (is_last ? 
rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; ++ ++ if ((req = media_request_get(ctx->mpool)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ if (set_req_ctls(ctx, req, ++ controls, ++#if HEVC_CTRLS_VERSION >= 2 ++ &rd->dec, ++#endif ++ rd->slice_params + i, j - i, ++ offsets, n_offsets)) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); ++ goto fail1; ++ } ++ ++ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__); ++ goto fail1; ++ } ++ ++ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__); ++ goto fail2; ++ } ++ ++ if (qent_src_params_set(src, &controls->tv)) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__); ++ goto fail2; ++ } ++ ++ stat = mediabufs_start_request(ctx->mbufs, &req, &src, ++ i == 0 ? rd->qe_dst : NULL, ++ is_last); ++ ++ if (stat != MEDIABUFS_STATUS_SUCCESS) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); ++ return AVERROR_UNKNOWN; ++ } ++ return 0; ++ ++fail2: ++ mediabufs_src_qent_abort(ctx->mbufs, &src); ++fail1: ++ media_request_abort(&req); ++ return AVERROR_UNKNOWN; ++} ++ ++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ struct req_controls rc; ++ unsigned int i; ++ int rv; ++ ++ // It is possible, though maybe a bug, to get an end_frame without ++ // a previous start_frame. If we do then give up. 
++    if (!decode_q_in_q(&rd->decode_ent)) {
++        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
++        return AVERROR_INVALIDDATA;
++    }
++
++    {   // Build the per-frame controls in rc: timestamp, SPS, PPS and (optionally) the scaling matrix
++        const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
++                                    &h->ps.pps->scaling_list :
++                                h->ps.sps->scaling_list_enable_flag ?
++                                    &h->ps.sps->scaling_list : NULL;   // PPS list takes precedence over the SPS list; NULL if neither flag is set
++
++
++        memset(&rc, 0, sizeof(rc));
++        rc.tv = cvt_dpb_to_tv(rd->timestamp);
++        fill_sps(&rc.sps, h->ps.sps);
++        fill_pps(&rc.pps, h->ps.pps);
++        if (sl) {
++            rc.has_scaling = 1;
++            fill_scaling_matrix(sl, &rc.scaling_matrix);
++        }
++    }
++
++    decode_q_wait(&ctx->decode_q, &rd->decode_ent);   // presumably blocks until it is this frame's turn in the decode queue - confirm against decode_q_wait()
++
++    // qe_dst needs to be bound to the data buffer and only returned when that is
++    // Alloc almost certainly wants to be serialised if there is any chance of blocking
++    // so we get the next frame to be free in the thread that needs it for decode first.
++    //
++    // In our current world this probably isn't a concern but put it here anyway
++    if (!rd->qe_dst)
++    {
++        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++            rv = AVERROR(ENOMEM);
++            goto fail;
++        }
++    }
++
++    // Send as slices: one request per group of up to ctx->max_slices slices
++    for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
++        const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
++        if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
++            goto fail;
++    }
++
++    // Set the drm_prime descriptor (NOTE(review): the return value of drm_from_format() is ignored here)
++    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
++    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
++    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
++
++    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++    return 0;
++
++fail:
++    decode_q_remove(&ctx->decode_q, &rd->decode_ent);   // the frame leaves the decode queue on the failure path too
++    return rv;
++}
++
++static inline int
++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
++{
++    return v >= c->minimum && v <= c->maximum;   // non-zero iff v lies within the control's [minimum, maximum]
++}
++
++// Initial check & 
init ++static int ++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ const HEVCSPS * const sps = h->ps.sps; ++ struct v4l2_ctrl_hevc_sps ctrl_sps; ++ unsigned int i; ++ ++ // Check for var slice array ++ struct v4l2_query_ext_ctrl qc[] = { ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_PPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, ++#if HEVC_CTRLS_VERSION >= 2 ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, ++#endif ++ }; ++ // Order & size must match! ++ static const size_t ctrl_sizes[] = { ++ sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(int32_t), ++ sizeof(struct v4l2_ctrl_hevc_sps), ++ sizeof(struct v4l2_ctrl_hevc_pps), ++ sizeof(struct v4l2_ctrl_hevc_scaling_matrix), ++#if HEVC_CTRLS_VERSION >= 2 ++ sizeof(struct v4l2_ctrl_hevc_decode_params), ++#endif ++ }; ++ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); ++ ++#if HEVC_CTRLS_VERSION == 2 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#elif HEVC_CTRLS_VERSION == 3 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#endif ++ ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); ++ i = 0; ++#if HEVC_CTRLS_VERSION >= 4 ++ // Skip slice check if no slice mode ++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ i = 1; ++#else ++ // Fail frame mode silently for anything prior to V4 ++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ return AVERROR(EINVAL); ++#endif ++ for (; i != noof_ctrls; ++i) { ++ if (qc[i].type == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); ++ return AVERROR(EINVAL); ++ } ++ if 
(ctrl_sizes[i] != (size_t)qc[i].elem_size) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", ++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ fill_sps(&ctrl_sps, sps); ++ ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++// Final init ++static int ++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) ++{ ++ int ret; ++ ++ struct v4l2_query_ext_ctrl querys[] = { ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, ++#if HEVC_CTRLS_VERSION >= 4 ++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, ++#endif ++ }; ++ ++ struct v4l2_ext_control ctrls[] = { ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ }; ++ ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); ++ ++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || ++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? ++ 1 : querys[2].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); ++ ++#if HEVC_CTRLS_VERSION >= 4 ++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
++ 0 : querys[3].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++#else ++ ctx->max_offsets = 0; ++#endif ++ ++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || ++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) ++ ctx->decode_mode = querys[0].default_value; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || ++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) ++ ctx->start_code = querys[1].default_value; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ // If we are in slice mode & START_CODE_NONE supported then pick that ++ // as it doesn't require the slightly dodgy look backwards in our raw buffer ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ ++ ctrls[0].value = ctx->decode_mode; ++ ctrls[1].value = ctx->start_code; ++ ++ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); ++ return !ret ? 
0 : AVERROR(-ret);
++}
++
++static void v4l2_req_frame_free(void *opaque, uint8_t *data)   // AVBuffer free callback (installed by v4l2_req_frame_alloc): releases everything owned by the descriptor in data
++{
++    AVCodecContext *avctx = opaque;
++    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
++
++    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
++
++    qent_dst_unref(&rd->qe_dst);
++
++    // We don't expect req or qe_src to be set; if either is, log both in the order the message names them
++    if (rd->req || rd->qe_src)
++        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->qe_src, rd->req);
++
++    av_freep(&rd->slices);
++    av_freep(&rd->slice_params);
++    av_freep(&rd->offsets);
++
++    av_free(rd);   // rd is the buffer's own data block, so this releases the descriptor itself
++}
++
++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)   // Returns a zero-filled buffer of size bytes that is freed via v4l2_req_frame_free, or NULL on allocation failure
++{
++    AVCodecContext *avctx = opaque;
++//    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++//    V4L2MediaReqDescriptor *req;
++    AVBufferRef *ref;
++    uint8_t *data;
++//    int ret;
++
++    data = av_mallocz(size);
++    if (!data)
++        return NULL;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
++    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
++    if (!ref) {
++        av_freep(&data);   // wrapping failed, so the raw allocation is still ours to release
++        return NULL;
++    }
++    return ref;
++}
++
++#if 0
++static void v4l2_req_pool_free(void *opaque)
++{
++    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{
++    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++    av_buffer_pool_uninit(&hwfc->pool);
++}
++#endif
++
++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)   // Fills the hw frames context (format, sw_format, size) from the format returned by mediabufs_dst_fmt()
++{
++    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++    AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++    const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
++
++    hwfc->format = AV_PIX_FMT_DRM_PRIME;
++    hwfc->sw_format = pixel_format_from_format(vfmt);
++    if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
++        hwfc->width = vfmt->fmt.pix_mp.width;
++        
hwfc->height = vfmt->fmt.pix_mp.height;
++    } else {
++        hwfc->width = vfmt->fmt.pix.width;
++        hwfc->height = vfmt->fmt.pix.height;
++    }
++#if 0
++    hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
++    if (!hwfc->pool)
++        return AVERROR(ENOMEM);
++
++    hwfc->free = v4l2_req_hwframe_ctx_free;
++
++    hwfc->initial_pool_size = 1;
++
++    switch (avctx->codec_id) {
++    case AV_CODEC_ID_VP9:
++        hwfc->initial_pool_size += 8;
++        break;
++    case AV_CODEC_ID_VP8:
++        hwfc->initial_pool_size += 3;
++        break;
++    default:
++        hwfc->initial_pool_size += 2;
++    }
++#endif
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
++
++    return 0;
++}
++
++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)   // Attaches a fresh V4L2MediaReqDescriptor buffer, a hw_frames_ctx reference and decode data to frame; returns 0 or a negative AVERROR
++{
++    int rv;
++
++    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
++    if (!frame->buf[0])
++        return AVERROR(ENOMEM);
++
++    frame->data[0] = frame->buf[0]->data;   // data[0] carries the descriptor; other code in this file casts it back to V4L2MediaReqDescriptor *
++
++    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
++    if (!frame->hw_frames_ctx) { av_frame_unref(frame); return AVERROR(ENOMEM); }   // av_buffer_ref() returns NULL on allocation failure; kept on one line so the patch line count is unchanged
++    if ((rv = ff_attach_decode_data(frame)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
++        av_frame_unref(frame);   // drops buf[0] (and with it the descriptor) plus the hw_frames_ctx reference
++        return rv;
++    }
++
++    return 0;
++}
++
++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {   // entry-point table for this build; V() presumably suffixes the symbol with the control-API version - confirm
++    .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
++    .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
++    .probe = probe,
++    .set_controls = set_controls,
++
++    .start_frame = v4l2_request_hevc_start_frame,
++    .decode_slice = v4l2_request_hevc_decode_slice,
++    .end_frame = v4l2_request_hevc_end_frame,
++    .abort_frame = v4l2_request_hevc_abort_frame,
++    .frame_params = frame_params,
++    .alloc_frame = alloc_frame,
++};
++
+diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
+new file mode 100644
+index 
0000000000..1a9944774a
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.c
+@@ -0,0 +1,1802 @@
++/*
++ * Copyright (C) 2018 Paul Kocialkowski
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++#include "weak_link.h"
++
++
++/*
++ * floor(log2(x)), found by testing successively smaller bit groups
++ * (16, 8, 4, 2, 1).  Returns 0 for both x == 0 and x == 1.  Only five
++ * halving steps are made, so the result cannot exceed 31; values of x
++ * with bits set above bit 31 are therefore under-reported.
++ */
++static unsigned int log2_size(size_t x)
++{
++    unsigned int n = 0;
++
++    if (x & ~0xffff) {
++        n += 16;
++        x >>= 16;
++    }
++    if (x & ~0xff) {
++        n += 8;
++        x >>= 8;
++    }
++    if (x & ~0xf) {
++        n += 4;
++        x >>= 4;
++    }
++    if (x & ~3) {
++        n += 2;
++        x >>= 2;
++    }
++    return (x & ~1) ?
n + 1 : n;
++}
++/*
++ * Round a requested size up to 3 << n or 4 << n (1.5x or 2x a power of two).
++ * For x >= 256, n is floor(log2(x)) - 1 and the result is the smallest of
++ * those two values that is strictly greater than x.  For x < 256, n is
++ * fixed at 8, so the result is 768.
++ */
++static size_t round_up_size(const size_t x)
++{
++    /* Admit no size < 256 */
++    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
++
++    return x >= (3 << n) ? 4 << n : (3 << n);
++}
++
++struct media_request;
++/*
++ * Pool of media requests allocated on one media device.
++ */
++struct media_pool {
++    int fd;                             /* media device fd, opened in media_pool_new() */
++    sem_t sem;                          /* counts requests currently on free_reqs */
++    pthread_mutex_t lock;               /* protects free_reqs */
++    struct media_request * free_reqs;   /* singly linked list of idle requests */
++    struct pollqueue * pq;              /* pollqueue passed to media_pool_new() */
++};
++/*
++ * One media request belonging to a media_pool.
++ */
++struct media_request {
++    struct media_request * next;        /* next entry on the pool's free list */
++    struct media_pool * mp;             /* owning pool */
++    int fd;                             /* request fd from MEDIA_IOC_REQUEST_ALLOC, -1 if unset */
++    struct polltask * pt;               /* poll task that runs media_request_done() */
++};
++/*
++ * Convert a mediabufs memory type to the V4L2 enum by plain cast.
++ * NOTE(review): relies on the two enums sharing numeric values - confirm
++ * against the declaration in v4l2_req_media.h.
++ */
++static inline enum v4l2_memory
++mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
++{
++    return (enum v4l2_memory)m;
++}
++/*
++ * Return a printable name for a memory type; "Unknown" for any value not
++ * listed in the switch.
++ */
++const char *
++mediabufs_memory_name(const enum mediabufs_memory m)
++{
++    switch (m) {
++    case MEDIABUFS_MEMORY_UNSET:
++        return "Unset";
++    case MEDIABUFS_MEMORY_MMAP:
++        return "MMap";
++    case MEDIABUFS_MEMORY_USERPTR:
++        return "UserPtr";
++    case MEDIABUFS_MEMORY_OVERLAY:
++        return "Overlay";
++    case MEDIABUFS_MEMORY_DMABUF:
++        return "DMABuf";
++    default:
++        break;
++    }
++    return "Unknown";
++}
++
++/*
++ * sem_trywait() that retries on EINTR.  Returns 0 if the semaphore was
++ * taken, otherwise -errno.
++ */
++static inline int do_trywait(sem_t *const sem)
++{
++    while (sem_trywait(sem)) {
++        if (errno != EINTR)
++            return -errno;
++    }
++    return 0;
++}
++/*
++ * sem_wait() that retries on EINTR.  Returns 0 on success, otherwise -errno.
++ */
++static inline int do_wait(sem_t *const sem)
++{
++    while (sem_wait(sem)) {
++        if (errno != EINTR)
++            return -errno;
++    }
++    return 0;
++}
++/*
++ * Issue VIDIOC_REQBUFS for buffers_count buffers of the given V4L2 buffer
++ * type and memory type.  Returns 0 on success or -errno (after logging).
++ * The count granted by the driver is not reported back to the caller.
++ */
++static int request_buffers(int video_fd, unsigned int type,
++                           enum mediabufs_memory memory, unsigned int buffers_count)
++{
++    struct v4l2_requestbuffers buffers;
++    int rc;
++
++    memset(&buffers, 0, sizeof(buffers));
++    buffers.type = type;
++    buffers.memory = mediabufs_memory_to_v4l2(memory);
++    buffers.count = buffers_count;
++
++    rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
++    if (rc < 0) {
++        rc = -errno;
++        request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc));
++        return rc;
++    }
++
++    return 0;
++}
++
++/*
++ * Turn streaming on or off for one buffer type via VIDIOC_STREAMON /
++ * VIDIOC_STREAMOFF.  Returns 0 on success or -errno (after logging).
++ */
++static int set_stream(int video_fd, unsigned int type, bool enable)
++{
++    enum
v4l2_buf_type buf_type = type;
++    int rc;
++
++    rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
++               &buf_type);
++    if (rc < 0) {
++        rc = -errno;
++        request_log("Unable to %sable stream: %s\n",
++                    enable ? "en" : "dis", strerror(-rc));
++        return rc;
++    }
++
++    return 0;
++}
++
++
++/*
++ * Take a request from the pool's free list, blocking on the pool semaphore
++ * until one is available.  Returns NULL if the wait fails, or if the list
++ * is unexpectedly empty after the wait.
++ */
++struct media_request * media_request_get(struct media_pool * const mp)
++{
++    struct media_request *req = NULL;
++
++    /* Timeout handled by poll code */
++    if (do_wait(&mp->sem))
++        return NULL;
++
++    pthread_mutex_lock(&mp->lock);
++    req = mp->free_reqs;
++    if (req) {
++        mp->free_reqs = req->next;
++        req->next = NULL;
++    }
++    pthread_mutex_unlock(&mp->lock);
++    return req;
++}
++/*
++ * Return the request's file descriptor.  req must not be NULL.
++ */
++int media_request_fd(const struct media_request * const req)
++{
++    return req->fd;
++}
++/*
++ * Queue the request with MEDIA_REQUEST_IOC_QUEUE (retrying on EINTR) and
++ * add its poll task with a timeout argument of 2000 so that
++ * media_request_done() runs for it.  Returns 0 on success or -errno; on
++ * failure the request is NOT returned to the pool here.
++ */
++int media_request_start(struct media_request * const req)
++{
++    while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1)
++    {
++        const int err = errno;
++        if (err == EINTR)
++            continue;
++        request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
++        return -err;
++    }
++
++    pollqueue_add_task(req->pt, 2000);
++    return 0;
++}
++/*
++ * Poll-task callback (also called directly by media_request_abort()).
++ * Reinitialises the request so it can be reused, pushes it back on the
++ * pool's free list and posts the pool semaphore.  revents is ignored, so a
++ * poll timeout is handled exactly like completion.
++ */
++static void media_request_done(void *v, short revents)
++{
++    struct media_request *const req = v;
++    struct media_pool *const mp = req->mp;
++
++    /* ** Not sure what to do about timeout */
++
++    if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
++        request_log("Unable to reinit media request: %s\n",
++                    strerror(errno));
++
++    pthread_mutex_lock(&mp->lock);
++    req->next = mp->free_reqs;
++    mp->free_reqs = req;
++    pthread_mutex_unlock(&mp->lock);
++    sem_post(&mp->sem);
++}
++/*
++ * Give back a request that was obtained but will not be started.  Clears
++ * *preq and recycles the request via media_request_done().  A NULL *preq
++ * is a no-op.  Always returns 0.
++ */
++int media_request_abort(struct media_request ** const preq)
++{
++    struct media_request * const req = *preq;
++
++    if (req == NULL)
++        return 0;
++    *preq = NULL;
++
++    media_request_done(req, 0);
++    return 0;
++}
++/*
++ * Free every request on a singly linked chain: delete its poll task if
++ * present, close its fd if one was allocated, then free the struct.
++ */
++static void delete_req_chain(struct media_request * const chain)
++{
++    struct media_request * next = chain;
++    while (next) {
++        struct media_request *
const req = next; ++ next = req->next; ++ if (req->pt) ++ polltask_delete(&req->pt); ++ if (req->fd != -1) ++ close(req->fd); ++ free(req); ++ } ++} ++ ++struct media_pool * media_pool_new(const char * const media_path, ++ struct pollqueue * const pq, ++ const unsigned int n) ++{ ++ struct media_pool * const mp = calloc(1, sizeof(*mp)); ++ unsigned int i; ++ ++ if (!mp) ++ goto fail0; ++ ++ mp->pq = pq; ++ pthread_mutex_init(&mp->lock, NULL); ++ mp->fd = open(media_path, O_RDWR | O_NONBLOCK); ++ if (mp->fd == -1) { ++ request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); ++ goto fail1; ++ } ++ ++ for (i = 0; i != n; ++i) { ++ struct media_request * req = malloc(sizeof(*req)); ++ if (!req) ++ goto fail4; ++ ++ *req = (struct media_request){ ++ .next = mp->free_reqs, ++ .mp = mp, ++ .fd = -1 ++ }; ++ mp->free_reqs = req; ++ ++ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { ++ request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); ++ goto fail4; ++ } ++ ++ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); ++ if (!req->pt) ++ goto fail4; ++ } ++ ++ sem_init(&mp->sem, 0, n); ++ ++ return mp; ++ ++fail4: ++ delete_req_chain(mp->free_reqs); ++ close(mp->fd); ++ pthread_mutex_destroy(&mp->lock); ++fail1: ++ free(mp); ++fail0: ++ return NULL; ++} ++ ++void media_pool_delete(struct media_pool ** pMp) ++{ ++ struct media_pool * const mp = *pMp; ++ ++ if (!mp) ++ return; ++ *pMp = NULL; ++ ++ delete_req_chain(mp->free_reqs); ++ close(mp->fd); ++ sem_destroy(&mp->sem); ++ pthread_mutex_destroy(&mp->lock); ++ free(mp); ++} ++ ++ ++#define INDEX_UNSET (~(uint32_t)0) ++ ++enum qent_status { ++ QENT_NEW = 0, // Initial state - shouldn't last ++ QENT_FREE, // On free chain ++ QENT_PENDING, // User has ent ++ QENT_WAITING, // On inuse ++ QENT_DONE, // Frame rx ++ QENT_ERROR, // Error ++ QENT_IMPORT ++}; ++ ++struct qent_base { ++ atomic_int ref_count; ++ struct qent_base *next; ++ struct qent_base *prev; ++ enum 
qent_status status;
++    enum mediabufs_memory memtype;      /* memory type used when (de)queueing */
++    uint32_t index;                     /* V4L2 buffer index, INDEX_UNSET until assigned */
++    struct dmabuf_h *dh[VIDEO_MAX_PLANES]; /* one dmabuf per plane, NULL if unused */
++    struct timeval timestamp;
++};
++/*
++ * Source (bitstream) queue entry.
++ */
++struct qent_src {
++    struct qent_base base;              /* must stay first: base_to_src() casts */
++    int fixed_size;                     /* non-zero: qent_src_data_copy() must not realloc */
++};
++/*
++ * Destination (decoded frame) queue entry.
++ */
++struct qent_dst {
++    struct qent_base base;              /* must stay first: base_to_dst() casts */
++    bool waiting;                       /* true while queued and not yet completed */
++    pthread_mutex_t lock;               /* protects waiting, paired with cond */
++    pthread_cond_t cond;                /* broadcast by qe_dst_done() */
++    struct ff_weak_link_client * mbc_wl; /* weak link back to the owning mediabufs_ctl */
++};
++/*
++ * Head/tail pair of a doubly linked list of qent_base.
++ */
++struct qe_list_head {
++    struct qent_base *head;
++    struct qent_base *tail;
++};
++/*
++ * A pool of queue entries for one direction (src or dst).
++ */
++struct buf_pool {
++    enum mediabufs_memory memtype;
++    pthread_mutex_t lock;               /* protects both lists */
++    sem_t free_sem;                     /* counts entries on the free list */
++    struct qe_list_head free;
++    struct qe_list_head inuse;
++};
++
++/*
++ * Downcasts from the embedded base; valid because base is the first member.
++ */
++static inline struct qent_dst *base_to_dst(struct qent_base *be)
++{
++    return (struct qent_dst *)be;
++}
++
++static inline struct qent_src *base_to_src(struct qent_base *be)
++{
++    return (struct qent_src *)be;
++}
++
++/*
++ * Initialiser for a fresh qent_base: ref count 0, QENT_NEW, no V4L2 index.
++ */
++#define QENT_BASE_INITIALIZER(mtype) {\
++    .ref_count = ATOMIC_VAR_INIT(0),\
++    .status = QENT_NEW,\
++    .memtype = (mtype),\
++    .index = INDEX_UNSET\
++}
++/*
++ * Free every plane dmabuf of an entry and clear the slots.
++ */
++static void qe_base_uninit(struct qent_base *const be)
++{
++    unsigned int i;
++    for (i = 0; i != VIDEO_MAX_PLANES; ++i) {
++        dmabuf_free(be->dh[i]);
++        be->dh[i] = NULL;
++    }
++}
++/*
++ * Release a source entry and its dmabufs.  NULL is a no-op.
++ */
++static void qe_src_free(struct qent_src *const be_src)
++{
++    if (!be_src)
++        return;
++    qe_base_uninit(&be_src->base);
++    free(be_src);
++}
++/*
++ * Allocate a source entry of the given memory type; all fields other than
++ * the base initialiser are zero.  Returns NULL on allocation failure.
++ */
++static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
++{
++    struct qent_src *const be_src = malloc(sizeof(*be_src));
++    if (!be_src)
++        return NULL;
++    *be_src = (struct qent_src){
++        .base = QENT_BASE_INITIALIZER(mtype)
++    };
++    return be_src;
++}
++/*
++ * Release a destination entry: drop its weak link, destroy its cond/mutex
++ * and free its dmabufs.  NULL is a no-op.
++ */
++static void qe_dst_free(struct qent_dst *const be_dst)
++{
++    if (!be_dst)
++        return;
++
++    ff_weak_link_unref(&be_dst->mbc_wl);
++    pthread_cond_destroy(&be_dst->cond);
++    pthread_mutex_destroy(&be_dst->lock);
++    qe_base_uninit(&be_dst->base);
++    free(be_dst);
++}
++/*
++ * Allocate a destination entry holding a new reference on the weak link wl.
++ * Returns NULL on allocation failure.
++ */
++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory
memtype)
++{
++    struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
++    if (!be_dst)
++        return NULL;
++    *be_dst = (struct qent_dst){
++        .base = QENT_BASE_INITIALIZER(memtype),
++        .lock = PTHREAD_MUTEX_INITIALIZER,
++        .cond = PTHREAD_COND_INITIALIZER,
++        .mbc_wl = ff_weak_link_ref(wl)
++    };
++    return be_dst;
++}
++/*
++ * Append be to the tail of list ql.  No locking is done here.
++ */
++static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
++{
++    if (ql->tail)
++        ql->tail->next = be;
++    else
++        ql->head = be;
++    be->prev = ql->tail;
++    be->next = NULL;
++    ql->tail = be;
++}
++/*
++ * Unlink be from list ql and return it with next/prev cleared.  Returns
++ * NULL if be is NULL.  be is assumed to be on ql; this is not checked.
++ */
++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
++{
++    if (!be)
++        return NULL;
++
++    if (be->next)
++        be->next->prev = be->prev;
++    else
++        ql->tail = be->prev;
++    if (be->prev)
++        be->prev->next = be->next;
++    else
++        ql->head = be->next;
++    be->next = NULL;
++    be->prev = NULL;
++    return be;
++}
++
++/*
++ * bq_* helpers operate on a pool's lists without taking bp->lock; the
++ * queue_* wrappers take the lock around them.
++ */
++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
++{
++    ql_add_tail(&bp->free, be);
++}
++/*
++ * Pop the head of the free list, or NULL if it is empty.
++ */
++static struct qent_base * bq_get_free(struct buf_pool *const bp)
++{
++    return ql_extract(&bp->free, bp->free.head);
++}
++/*
++ * Unlink a specific entry from the in-use list.
++ */
++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
++{
++    return ql_extract(&bp->inuse, be);
++}
++/*
++ * Pop the head of the in-use list, or NULL if it is empty.
++ */
++static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
++{
++    return ql_extract(&bp->inuse, bp->inuse.head);
++}
++/*
++ * Drain the free list, freeing each entry as a source entry.
++ */
++static void bq_free_all_free_src(struct buf_pool *const bp)
++{
++    struct qent_base *be;
++    while ((be = bq_get_free(bp)) != NULL)
++        qe_src_free(base_to_src(be));
++}
++/*
++ * Drain the in-use list, freeing each entry as a source entry.
++ */
++static void bq_free_all_inuse_src(struct buf_pool *const bp)
++{
++    struct qent_base *be;
++    while ((be = bq_get_inuse(bp)) != NULL)
++        qe_src_free(base_to_src(be));
++}
++/*
++ * Drain the free list, freeing each entry as a destination entry.
++ */
++static void bq_free_all_free_dst(struct buf_pool *const bp)
++{
++    struct qent_base *be;
++    while ((be = bq_get_free(bp)) != NULL)
++        qe_dst_free(base_to_dst(be));
++}
++/*
++ * Return an entry to the pool's free list: reset its timestamp, mark it
++ * QENT_FREE, zero the data length of each attached dmabuf, then post
++ * free_sem so a waiter in queue_get_free() can proceed.
++ */
++static void queue_put_free(struct buf_pool *const bp, struct
qent_base *be)
++{
++    unsigned int i;
++
++    pthread_mutex_lock(&bp->lock);
++    /* Clear out state vars */
++    be->timestamp.tv_sec = 0;
++    be->timestamp.tv_usec = 0;
++    be->status = QENT_FREE;
++    for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)
++        dmabuf_len_set(be->dh[i], 0);
++    bq_put_free(bp, be);
++    pthread_mutex_unlock(&bp->lock);
++    sem_post(&bp->free_sem);
++}
++/*
++ * True if the in-use list is non-empty.  Reads the list without taking
++ * bp->lock.
++ */
++static bool queue_is_inuse(const struct buf_pool *const bp)
++{
++    return bp->inuse.tail != NULL;
++}
++/*
++ * Append be to the in-use list and mark it QENT_WAITING.  NULL is a no-op.
++ */
++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
++{
++    if (!be)
++        return;
++    pthread_mutex_lock(&bp->lock);
++    ql_add_tail(&bp->inuse, be);
++    be->status = QENT_WAITING;
++    pthread_mutex_unlock(&bp->lock);
++}
++/*
++ * Take an entry from the free list, blocking on free_sem until one is
++ * available.  Returns NULL if the semaphore wait fails.
++ */
++static struct qent_base *queue_get_free(struct buf_pool *const bp)
++{
++    struct qent_base *buf;
++
++    if (do_wait(&bp->free_sem))
++        return NULL;
++    pthread_mutex_lock(&bp->lock);
++    buf = bq_get_free(bp);
++    pthread_mutex_unlock(&bp->lock);
++    return buf;
++}
++/*
++ * Non-blocking variant of queue_get_free(): returns NULL immediately if no
++ * free entry is available.
++ */
++static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
++{
++    struct qent_base *buf;
++
++    if (do_trywait(&bp->free_sem))
++        return NULL;
++    pthread_mutex_lock(&bp->lock);
++    buf = bq_get_free(bp);
++    pthread_mutex_unlock(&bp->lock);
++    return buf;
++}
++/*
++ * Find the in-use entry with the given V4L2 buffer index, unlink it and
++ * return it.  Returns NULL if no entry matches (the loop then ends with
++ * be == NULL).
++ */
++static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
++{
++    struct qent_base *be;
++
++    pthread_mutex_lock(&bp->lock);
++    /* Expect 1st in Q, but allow anywhere */
++    for (be = bp->inuse.head; be; be = be->next) {
++        if (be->index == index) {
++            bq_extract_inuse(bp, be);
++            break;
++        }
++    }
++    pthread_mutex_unlock(&bp->lock);
++
++    return be;
++}
++/*
++ * Destroy the pool's semaphore and mutex and free the pool itself.
++ * Entries still on its lists are not freed here.
++ */
++static void queue_delete(struct buf_pool *const bp)
++{
++    sem_destroy(&bp->free_sem);
++    pthread_mutex_destroy(&bp->lock);
++    free(bp);
++}
++/*
++ * Allocate an empty pool with an initialised mutex and a free semaphore
++ * starting at 0.  The vfd argument is not referenced by the body.
++ */
++static struct buf_pool* queue_new(const int vfd)
++{
++    struct buf_pool *bp = calloc(1, sizeof(*bp));
++    if (!bp)
++        return NULL;
++    pthread_mutex_init(&bp->lock, NULL);
++
sem_init(&bp->free_sem, 0, 0);
++    return bp;
++}
++
++/*
++ * State for one V4L2 video device handling paired src (bitstream) and dst
++ * (frame) buffer queues.
++ */
++struct mediabufs_ctl {
++    atomic_int ref_count;  /* 0 is single ref for easier atomics */
++    void * dc;             /* context pointer passed to request_err/info/warn */
++    int vfd;               /* video device fd used for all V4L2 ioctls */
++    bool stream_on;
++    bool polling;          /* true while pt is scheduled on the pollqueue */
++    bool dst_fixed;        // Dst Q is fixed size
++    pthread_mutex_t lock;
++    struct buf_pool * src;
++    struct buf_pool * dst;
++    struct polltask * pt;
++    struct pollqueue * pq;
++    struct ff_weak_link_master * this_wlm; /* master end of the link held by each qent_dst */
++
++    enum mediabufs_memory src_memtype;
++    enum mediabufs_memory dst_memtype;
++    struct v4l2_format src_fmt;
++    struct v4l2_format dst_fmt;
++    struct v4l2_capability capability;
++};
++/*
++ * Queue one entry to the video device with VIDIOC_QBUF (retrying on EINTR).
++ *
++ * be        entry to queue; its index, memtype and dmabufs describe the buffer
++ * vfd       video device fd
++ * mreq      media request to bind a source buffer to, may be NULL
++ * fmt       format of the queue; selects single- vs multi-planar layout
++ * is_dst    true for a destination buffer: data lengths and timestamp are
++ *           zeroed before queueing and mreq is ignored
++ * hold_flag for source buffers with a request, also sets
++ *           V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF
++ *
++ * Returns 0 on success or -errno.
++ */
++static int qe_v4l2_queue(struct qent_base *const be,
++                         const int vfd, struct media_request *const mreq,
++                         const struct v4l2_format *const fmt,
++                         const bool is_dst, const bool hold_flag)
++{
++    struct v4l2_buffer buffer = {
++        .type = fmt->type,
++        .memory = mediabufs_memory_to_v4l2(be->memtype),
++        .index = be->index
++    };
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        unsigned int i;
++        for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++            if (is_dst)
++                dmabuf_len_set(be->dh[i], 0);
++
++            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
++            planes[i].length = dmabuf_size(be->dh[i]);
++            planes[i].bytesused = dmabuf_len(be->dh[i]);
++            if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++                planes[i].m.fd = dmabuf_fd(be->dh[i]);
++            else
++                planes[i].m.mem_offset = 0;
++        }
++        buffer.m.planes = planes;
++        buffer.length = i; /* number of planes that have a dmabuf attached */
++    }
++    else {
++        if (is_dst)
++            dmabuf_len_set(be->dh[0], 0);
++
++        buffer.bytesused = dmabuf_len(be->dh[0]);
++        buffer.length = dmabuf_size(be->dh[0]);
++        if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++            buffer.m.fd = dmabuf_fd(be->dh[0]);
++        else
++            buffer.m.offset = 0;
++    }
++
++    if (!is_dst && mreq) {
++        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
++        buffer.request_fd = media_request_fd(mreq);
++        if (hold_flag)
++            buffer.flags |=
V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
++    }
++
++    if (is_dst)
++        be->timestamp = (struct timeval){0,0};
++
++    buffer.timestamp = be->timestamp;
++
++    while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
++        const int err = errno;
++        if (err != EINTR) {
++            request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
++            return -err;
++        }
++    }
++    return 0;
++}
++/*
++ * Dequeue one buffer from the video device (VIDIOC_DQBUF, retrying on
++ * EINTR) and return the matching entry after removing it from bp's in-use
++ * list.
++ *
++ * For capture queues the entry's dmabuf data lengths are set to the number
++ * of bytes the driver reports as used; for output queues they are reset to
++ * 0.  The entry's timestamp is copied from the buffer and its status is set
++ * to QENT_ERROR if the driver flagged an error, else QENT_DONE.
++ *
++ * Returns NULL if the ioctl fails or no in-use entry has the dequeued index.
++ */
++static struct qent_base * qe_dequeue(struct buf_pool *const bp,
++                                     const int vfd,
++                                     const struct v4l2_format * const f)
++{
++    struct qent_base *be;
++    int rc;
++    const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++    struct v4l2_buffer buffer = {
++        .type = f->type,
++        .memory = mediabufs_memory_to_v4l2(bp->memtype)
++    };
++    if (mp) {
++        buffer.length = f->fmt.pix_mp.num_planes;
++        buffer.m.planes = planes;
++    }
++
++    while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
++           errno == EINTR)
++        /* Loop */;
++    if (rc) {
++        request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
++        return NULL;
++    }
++
++    be = queue_find_extract_index(bp, buffer.index);
++    if (!be) {
++        request_log("Failed to find index %d in Q\n", buffer.index);
++        return NULL;
++    }
++
++    if (mp) {
++        unsigned int i;
++        for (i = 0; i != buffer.length; ++i)
++            dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
++    }
++    else
++        /* bytesused is the payload size; length is only the buffer capacity.
++         * This matches what the multi-planar branch records per plane. */
++        dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.bytesused : 0);
++
++    be->timestamp = buffer.timestamp;
++    be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ?
QENT_ERROR : QENT_DONE;
++    return be;
++}
++/*
++ * Mark a destination entry as no longer waiting, wake every thread blocked
++ * in qent_dst_wait(), then drop one reference on the entry.
++ */
++static void qe_dst_done(struct qent_dst * dst_be)
++{
++    pthread_mutex_lock(&dst_be->lock);
++    dst_be->waiting = false;
++    pthread_cond_broadcast(&dst_be->cond);
++    pthread_mutex_unlock(&dst_be->lock);
++
++    qent_dst_unref(&dst_be);
++}
++/*
++ * Test-and-set of the waiting flag: sets it to true and returns the value
++ * it had before.
++ */
++static bool qe_dst_waiting(struct qent_dst *const dst_be)
++{
++    bool waiting;
++    pthread_mutex_lock(&dst_be->lock);
++    waiting = dst_be->waiting;
++    dst_be->waiting = true;
++    pthread_mutex_unlock(&dst_be->lock);
++    return waiting;
++}
++
++/*
++ * True while either queue still has buffers owned by the driver.
++ */
++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
++{
++    return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
++}
++/*
++ * Poll-task callback for the video fd.  Under mbc->lock it dequeues a
++ * source buffer on POLLOUT and a destination buffer on POLLIN and re-arms
++ * the poll task if buffers remain in use.  After dropping the lock, the
++ * source entry goes back to the free list and the destination entry is
++ * completed via qe_dst_done().  revents == 0 is logged as a timeout.
++ */
++static void mediabufs_poll_cb(void * v, short revents)
++{
++    struct mediabufs_ctl *mbc = v;
++    struct qent_src *src_be = NULL;
++    struct qent_dst *dst_be = NULL;
++
++    if (!revents)
++        request_err(mbc->dc, "%s: Timeout\n", __func__);
++
++    pthread_mutex_lock(&mbc->lock);
++    mbc->polling = false;
++
++    if ((revents & POLLOUT) != 0)
++        src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
++    if ((revents & POLLIN) != 0)
++        dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
++
++    /* Reschedule */
++    if (mediabufs_wants_poll(mbc)) {
++        mbc->polling = true;
++        pollqueue_add_task(mbc->pt, 2000);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++
++    if (src_be)
++        queue_put_free(mbc->src, &src_be->base);
++    if (dst_be)
++        qe_dst_done(dst_be);
++}
++/*
++ * Store the timestamp that will be sent with this source buffer.  Always
++ * returns 0.
++ */
++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
++{
++    struct qent_base *const be = &be_src->base;
++
++    be->timestamp = *timestamp;
++    return 0;
++}
++/*
++ * Return the timestamp recorded on a destination entry.
++ */
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
++{
++    return be_dst->base.timestamp;
++}
++/*
++ * Make sure plane 0 of be can hold len bytes, reallocating through dbsc to
++ * round_up_size(len) when it is missing or too small.  Returns 0 on
++ * success, or -ENOMEM if a realloc is needed but dbsc is NULL or the
++ * realloc fails.
++ */
++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
++        size_t newsize = round_up_size(len);
++
request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
++        if (!dbsc) {
++            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
++            return -ENOMEM;
++        }
++        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
++            request_log("%s: Realloc %zd failed\n", __func__, newsize);
++            return -ENOMEM;
++        }
++    }
++    return 0;
++}
++/*
++ * Ensure the source entry's buffer can hold len bytes, allocating through
++ * dbsc if needed.  Returns 0 or -ENOMEM.
++ */
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    struct qent_base *const be = &be_src->base;
++    return qent_base_realloc(be, len, dbsc);
++}
++
++/*
++ * Copy len bytes from src into the source entry's buffer at offset.
++ *
++ * The buffer is grown via dbsc only when offset is 0 and the entry is not
++ * fixed-size; otherwise a buffer that is too small is an error.  The
++ * recorded data length is set to len (offset is not added to it).
++ *
++ * Returns 0 on success, -ENOMEM if the buffer is too small and cannot be
++ * grown, or -1 if the buffer cannot be mapped.
++ */
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    void * dst;
++    struct qent_base *const be = &be_src->base;
++    int rv;
++
++    // Realloc doesn't copy so don't alloc if offset != 0
++    if ((rv = qent_base_realloc(be, offset + len,
++                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
++        return rv;
++
++    dmabuf_write_start(be->dh[0]);
++    dst = dmabuf_map(be->dh[0]);
++    if (!dst) {
++        /* Keep write_start/write_end balanced on the failure path too */
++        dmabuf_write_end(be->dh[0]);
++        return -1;
++    }
++    memcpy((char*)dst + offset, src, len);
++    dmabuf_len_set(be->dh[0], len);
++    dmabuf_write_end(be->dh[0]);
++    return 0;
++}
++/*
++ * Return the dmabuf for a plane of a destination entry, or NULL if plane
++ * is out of range.
++ */
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
++{
++    const struct qent_base *const be = &be_dst->base;
++
++    return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ?
NULL : be->dh[plane];
++}
++/*
++ * Return a dup() of the fd backing one plane of a destination entry.
++ */
++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
++{
++    return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
++}
++/*
++ * Queue a source buffer (and optionally a destination buffer) and start the
++ * media request they belong to.
++ *
++ * *pmreq and *psrc_be are always consumed: both are set to NULL whatever
++ * the outcome.  dst_be may be NULL when no destination buffer is to be
++ * queued; when given, an extra reference is taken on it while it is queued.
++ * is_final == false asks the driver to hold the capture buffer for further
++ * source buffers.
++ *
++ * Returns MEDIABUFS_STATUS_SUCCESS, or MEDIABUFS_ERROR_OPERATION_FAILED.
++ * On the fail1 path the request is aborted, the source entry is returned
++ * to the free list and the destination entry is marked QENT_ERROR and
++ * completed.
++ */
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++                                        struct media_request **const pmreq,
++                                        struct qent_src **const psrc_be,
++                                        struct qent_dst *const dst_be,
++                                        const bool is_final)
++{
++    struct media_request * mreq = *pmreq;
++    struct qent_src *const src_be = *psrc_be;
++
++    // Req & src are always both "consumed"
++    *pmreq = NULL;
++    *psrc_be = NULL;
++
++    pthread_mutex_lock(&mbc->lock);
++
++    if (!src_be)
++        goto fail1;
++
++    if (dst_be) {
++        if (qe_dst_waiting(dst_be)) {
++            request_info(mbc->dc, "Request buffer already waiting on start\n");
++            goto fail1;
++        }
++        dst_be->base.timestamp = (struct timeval){0,0};
++        if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
++            goto fail1;
++
++        qent_dst_ref(dst_be);
++        queue_put_inuse(mbc->dst, &dst_be->base);
++    }
++
++    if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
++        goto fail1;
++    queue_put_inuse(mbc->src, &src_be->base);
++
++    if (!mbc->polling && mediabufs_wants_poll(mbc)) {
++        mbc->polling = true;
++        pollqueue_add_task(mbc->pt, 2000);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++
++    if (media_request_start(mreq))
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail1:
++    media_request_abort(&mreq);
++    if (src_be)
++        queue_put_free(mbc->src, &src_be->base);
++
++// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
++    if (dst_be) {
++        dst_be->base.status = QENT_ERROR;
++        qe_dst_done(dst_be);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++    return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++/*
++ * (Re)allocate the dmabufs of be through dbsc so that each plane matches
++ * the sizeimage given in fmt.  Returns 0 on success or -1 on allocation
++ * failure; in the multi-planar case the planes allocated before the
++ * failing one are freed.
++ */
++static int qe_alloc_from_fmt(struct qent_base *const be,
++                             struct dmabufs_ctl *const dbsc,
++                             const struct v4l2_format *const fmt)
++{
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        unsigned int i;
++
for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
++            be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
++                                       fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
++            /* On failure tidy up and die */
++            if (!be->dh[i]) {
++                while (i--) {
++                    dmabuf_free(be->dh[i]);
++                    be->dh[i] = NULL;
++                }
++                return -1;
++            }
++        }
++    }
++    else {
++//      be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
++        size_t size = fmt->fmt.pix.sizeimage;
++        be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
++        if (!be->dh[0])
++            return -1;
++    }
++    return 0;
++}
++/*
++ * Build a v4l2_format for buftype from the given pixel format, dimensions
++ * and (optional) buffer size, and apply it with VIDIOC_S_FMT (retrying on
++ * EINTR).  *fmt is overwritten and on return holds what the driver set.
++ *
++ * In the multi-planar case bufsize == 0 leaves num_planes and sizeimage at
++ * zero; otherwise a single plane of bufsize bytes is requested.
++ *
++ * Returns MEDIABUFS_STATUS_SUCCESS, MEDIABUFS_ERROR_OPERATION_FAILED if the
++ * ioctl fails, or MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE if the driver
++ * returned a smaller width/height or a different pixel format.
++ */
++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
++                               const enum v4l2_buf_type buftype,
++                               uint32_t pixfmt,
++                               const unsigned int width, const unsigned int height,
++                               const size_t bufsize)
++{
++    *fmt = (struct v4l2_format){.type = buftype};
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++        fmt->fmt.pix_mp.width = width;
++        fmt->fmt.pix_mp.height = height;
++        fmt->fmt.pix_mp.pixelformat = pixfmt;
++        if (bufsize) {
++            fmt->fmt.pix_mp.num_planes = 1;
++            fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
++        }
++    }
++    else {
++        fmt->fmt.pix.width = width;
++        fmt->fmt.pix.height = height;
++        fmt->fmt.pix.pixelformat = pixfmt;
++        fmt->fmt.pix.sizeimage = bufsize;
++    }
++
++    while (ioctl(fd, VIDIOC_S_FMT, fmt))
++        if (errno != EINTR)
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    // Treat anything where we don't get at least what we asked for as a fail
++    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++        if (fmt->fmt.pix_mp.width < width ||
++            fmt->fmt.pix_mp.height < height ||
++            fmt->fmt.pix_mp.pixelformat != pixfmt) {
++            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++    }
++    else {
++        if (fmt->fmt.pix.width < width ||
++            fmt->fmt.pix.height < height ||
++            fmt->fmt.pix.pixelformat != pixfmt) {
++            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++    }
++
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++/*
++ * Enumerate the device's formats for type_v4l2 and apply the first one
++ * that has all flags_must bits set, none of the flags_not bits set, is
++ * accepted by accept_fn, and can be set at width x height by fmt_set().
++ */
++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
++                                      const int fd,
++                                      const unsigned int type_v4l2,
++                                      const
uint32_t flags_must,
++                                      const uint32_t flags_not,
++                                      const unsigned int width,
++                                      const unsigned int height,
++                                      mediabufs_dst_fmt_accept_fn *const accept_fn,
++                                      void *const accept_v)
++{
++    unsigned int i;
++
++    /* No loop bound: the loop ends when VIDIOC_ENUM_FMT fails for index i,
++     * which is how running out of formats is detected */
++    for (i = 0;; ++i) {
++        struct v4l2_fmtdesc fmtdesc = {
++            .index = i,
++            .type = type_v4l2
++        };
++        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++            if (errno != EINTR)
++                return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++        if ((fmtdesc.flags & flags_must) != flags_must ||
++            (fmtdesc.flags & flags_not))
++            continue;
++        if (!accept_fn(accept_v, &fmtdesc))
++            continue;
++
++        if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
++                    width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
++            return MEDIABUFS_STATUS_SUCCESS;
++    }
++    return 0; /* not reached: the loop above only exits via return */
++}
++
++
++/*
++ * Wait for qent done
++ *
++ * Blocks until the entry's waiting flag is cleared (or pthread_cond_wait
++ * fails), then maps the entry status to a result: QENT_DONE gives
++ * MEDIABUFS_STATUS_SUCCESS, QENT_ERROR gives MEDIABUFS_ERROR_DECODING_ERROR,
++ * anything else gives MEDIABUFS_ERROR_OPERATION_FAILED.
++ */
++
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
++{
++    struct qent_base *const be = &be_dst->base;
++    enum qent_status estat;
++
++    pthread_mutex_lock(&be_dst->lock);
++    while (be_dst->waiting &&
++           !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
++        /* Loop */;
++    estat = be->status;
++    pthread_mutex_unlock(&be_dst->lock);
++
++    return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
++        estat == QENT_ERROR ?
MEDIABUFS_ERROR_DECODING_ERROR :
++        MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++/*
++ * Map one plane of a destination entry and return a pointer to its data.
++ * Returns NULL if buf_no is not a valid plane index; otherwise returns
++ * whatever dmabuf_map() returns for that plane.
++ */
++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
++{
++    struct qent_base *const be = &be_dst->base;
++    /* Same range check as qent_dst_dmabuf(): never index past dh[] */
++    if (buf_no >= sizeof(be->dh)/sizeof(be->dh[0]))
++        return NULL;
++    return dmabuf_map(be->dh[buf_no]);
++}
++/*
++ * Begin CPU read access on every attached plane.  If any plane fails, read
++ * access on the planes already started is ended again and
++ * MEDIABUFS_ERROR_ALLOCATION_FAILED is returned.
++ */
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
++{
++    struct qent_base *const be = &be_dst->base;
++    unsigned int i;
++    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++        if (dmabuf_read_start(be->dh[i])) {
++            while (i--)
++                dmabuf_read_end(be->dh[i]);
++            return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++        }
++    }
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++/*
++ * End CPU read access on every attached plane.  All planes are processed
++ * even if one fails; returns MEDIABUFS_ERROR_OPERATION_FAILED if any did.
++ */
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
++{
++    struct qent_base *const be = &be_dst->base;
++    unsigned int i;
++    MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++        if (dmabuf_read_end(be->dh[i]))
++            status = MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++    return status;
++}
++/*
++ * Take an extra reference on a destination entry.  NULL is passed through.
++ */
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
++{
++    if (be_dst)
++        atomic_fetch_add(&be_dst->base.ref_count, 1);
++    return be_dst;
++}
++/*
++ * Drop a reference on *pbe_dst and clear the pointer.  The count is
++ * zero-based (0 means one reference), so the entry is released when the
++ * value before the decrement was 0.  On release it is returned to the
++ * owning mediabufs_ctl's dst free list if the weak link can still be
++ * locked, otherwise it is freed.
++ */
++void qent_dst_unref(struct qent_dst ** const pbe_dst)
++{
++    struct qent_dst * const be_dst = *pbe_dst;
++    struct mediabufs_ctl * mbc;
++    if (!be_dst)
++        return;
++    *pbe_dst = NULL;
++
++    if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
++        return;
++
++    if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
++        queue_put_free(mbc->dst, &be_dst->base);
++        ff_weak_link_unlock(be_dst->mbc_wl);
++    }
++    else {
++        qe_dst_free(be_dst);
++    }
++}
++/*
++ * Attach an externally supplied dmabuf fd of the given size to one plane
++ * of an import-type destination entry.
++ *
++ * Returns MEDIABUFS_ERROR_OPERATION_FAILED if plane is out of range, the
++ * entry is not in QENT_IMPORT state or the plane is already populated, and
++ * MEDIABUFS_ERROR_ALLOCATION_FAILED if the import itself fails.
++ */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++                                   unsigned int plane,
++                                   int fd, size_t size)
++{
++    struct qent_base *const be = &be_dst->base;
++    struct dmabuf_h * dh;
++
++    /* Range check first so dh[plane] is never read out of bounds */
++    if (plane >= sizeof(be->dh)/sizeof(be->dh[0]) ||
++        be->status != QENT_IMPORT || be->dh[plane])
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    dh = dmabuf_import(fd, size);
++    if (!dh)
++        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++    be->dh[plane] = dh;
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++/*
++ * Ask the driver to create n destination buffers (VIDIOC_CREATE_BUFS,
++ * retrying on EINTR) and record the assigned V4L2 indices in qes[].
++ * The driver may create fewer than n; only that many entries get an index.
++ */
++// Returns noof buffers created, -ve for error
++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
++{
++    unsigned int i;
++
++    struct v4l2_create_buffers cbuf = {
++        .count = n,
++        .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
++        .format = mbc->dst_fmt,
++    };
++
++    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
++        /* Keep errno positive here so the EINTR test works and the value
++         * returned below is negative, as callers expect */
++        const int err = errno;
++        if (err != EINTR) {
++            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
++            return -err;
++        }
++    }
++
++    if (cbuf.count != n)
++        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
++
++    for (i = 0; i != cbuf.count; ++i)
++        qes[i]->base.index = cbuf.index + i;
++
++    return cbuf.count;
++}
++/*
++ * Populate the dmabuf handles of be from V4L2 buffer n of the queue
++ * described by fmt.  The buffer is queried with VIDIOC_QUERYBUF; each plane
++ * is then either exported as a dmabuf fd (x_dmabuf true) or mmap()ed from
++ * the video fd.  Does nothing and returns 0 if plane 0 is already set.
++ * Returns 0 on success or MEDIABUFS_ERROR_OPERATION_FAILED.
++ */
++static MediaBufsStatus
++qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
++                   const unsigned int n, const bool x_dmabuf)
++{
++    struct v4l2_buffer buf = {
++        .index = n,
++        .type = fmt->type,
++    };
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    int ret;
++
++    if (be->dh[0])
++        return 0;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        memset(planes, 0, sizeof(planes));
++        buf.m.planes = planes;
++        buf.length = VIDEO_MAX_PLANES;
++    }
++
++    if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
++        request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
++    {
++        unsigned int i;
++        for (i = 0; i != buf.length; ++i) {
++            if (x_dmabuf) {
++                struct v4l2_exportbuffer xbuf = {
++                    .type = buf.type,
++                    .index = buf.index,
++                    .plane = i,
++                    .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++                };
++                if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++                    be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
++            }
++            else {
++                be->dh[i] = dmabuf_import_mmap(
++                    mmap(NULL, planes[i].length,
++                         PROT_READ | PROT_WRITE,
++                         MAP_SHARED | MAP_POPULATE,
++                         mbc->vfd, planes[i].m.mem_offset),
++                    planes[i].length);
++            }
++            /* On failure tidy up and die */
++            if (!be->dh[i]) {
++                while (i--) {
++                    dmabuf_free(be->dh[i]);
++                    be->dh[i] = NULL;
++                }
++                return MEDIABUFS_ERROR_OPERATION_FAILED;
++            }
++        }
++    }
++    else
++    {
++        if (x_dmabuf) {
++            struct v4l2_exportbuffer xbuf = {
++                .type = buf.type,
++                .index = buf.index,
++                .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++            };
++            if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++                be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
++        }
++        else {
++            be->dh[0] = dmabuf_import_mmap(
++                mmap(NULL, buf.length,
++                     PROT_READ | PROT_WRITE,
++                     MAP_SHARED | MAP_POPULATE,
++                     mbc->vfd, buf.m.offset),
++                buf.length);
++        }
++        /* On failure tidy up and die */
++        if (!be->dh[0]) {
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++        }
++    }
++
++    return 0;
++}
++/*
++ * Obtain a destination entry ready to be given to the decoder.
++ *
++ * With mbc == NULL a stand-alone DMABUF entry in QENT_IMPORT state is
++ * returned, to be filled in with qent_dst_import_fd().
++ *
++ * Otherwise an entry is taken from the dst free list: blocking when the
++ * dst queue is fixed-size, else non-blocking with a fallback that creates
++ * a new entry plus one new V4L2 buffer.  Its memory is then attached,
++ * exported from the V4L2 buffer for MMAP queues or allocated from dbsc to
++ * match dst_fmt for other types.
++ *
++ * Returns the entry with status QENT_PENDING and a zero-based ref count of
++ * 0 (one reference), or NULL on failure.
++ */
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
++{
++    struct qent_dst * be_dst;
++
++    if (mbc == NULL) {
++        be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
++        if (be_dst)
++            be_dst->base.status = QENT_IMPORT;
++        return be_dst;
++    }
++
++    if (mbc->dst_fixed) {
++        be_dst = base_to_dst(queue_get_free(mbc->dst));
++        if (!be_dst)
++            return NULL;
++    }
++    else {
++        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
++        if (!be_dst) {
++            be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
++            if (!be_dst)
++                return NULL;
++
++            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
++                qe_dst_free(be_dst);
++                return NULL;
++            }
++        }
++    }
++
++    if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
++        if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {
++            request_err(mbc->dc, "Failed to export as dmabuf\n");
++            queue_put_free(mbc->dst, &be_dst->base);
++            return NULL;
++        }
++    }
++    else {
++        if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
++            /* Given how create buf works we can't uncreate it on
alloc failure ++ * all we can do is put it on the free Q ++ */ ++ queue_put_free(mbc->dst, &be_dst->base); ++ return NULL; ++ } ++ } ++ ++ be_dst->base.status = QENT_PENDING; ++ atomic_store(&be_dst->base.ref_count, 0); ++ return be_dst; ++} ++ ++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc) ++{ ++ return &mbc->dst_fmt; ++} ++ ++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, ++ const unsigned int width, ++ const unsigned int height, ++ mediabufs_dst_fmt_accept_fn *const accept_fn, ++ void *const accept_v) ++{ ++ MediaBufsStatus status; ++ unsigned int i; ++ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; ++ static const struct { ++ unsigned int flags_must; ++ unsigned int flags_not; ++ } trys[] = { ++ {0, V4L2_FMT_FLAG_EMULATED}, ++ {V4L2_FMT_FLAG_EMULATED, 0}, ++ }; ++ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { ++ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, ++ buf_type, ++ trys[i].flags_must, ++ trys[i].flags_not, ++ width, height, accept_fn, accept_v); ++ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) ++ return status; ++ } ++ ++ if (status != MEDIABUFS_STATUS_SUCCESS) ++ return status; ++ ++ /* Try to create a buffer - don't alloc */ ++ return status; ++} ++ ++// ** This is a mess if we get partial alloc but without any way to remove ++// individual V4L2 Q members we are somewhat stuffed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype) ++{ ++ unsigned int i; ++ int a = 0; ++ unsigned int qc; ++ struct qent_dst * qes[32]; ++ ++ if (n > 32) ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ ++ mbc->dst->memtype = memtype; ++ ++ // Create qents first as it is hard to get rid of the V4L2 buffers on error ++ for (qc = 0; qc != n; ++qc) ++ { ++ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL) ++ goto fail; ++ } ++ ++ if ((a = create_dst_bufs(mbc, n, qes)) < 0) ++ 
goto fail;
++
++    for (i = 0; i != a; ++i)
++        queue_put_free(mbc->dst, &qes[i]->base);
++
++    if (a != n)
++        goto fail;
++
++    mbc->dst_fixed = fixed;
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++    for (i = (a < 0 ? 0 : a); i != qc; ++i)
++        qe_dst_free(qes[i]);
++
++    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++}
++
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
++{
++    struct qent_base * buf = queue_get_free(mbc->src);
++    if (buf) buf->status = QENT_PENDING; /* the dst path NULL-checks this same call; do not deref blindly */
++    return buf ? base_to_src(buf) : NULL; /* NULL tells the caller no src buffer was obtained */
++}
++
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
++{
++    struct qent_src *const qe_src = *pqe_src;
++    if (!qe_src)
++        return;
++    *pqe_src = NULL;
++    queue_put_free(mbc->src, &qe_src->base);
++}
++
++static MediaBufsStatus
++chk_memory_type(struct mediabufs_ctl *const mbc,
++    const struct v4l2_format * const f,
++    const enum mediabufs_memory m)
++{
++    struct v4l2_create_buffers cbuf = {
++        .count = 0,
++        .memory = V4L2_MEMORY_MMAP,
++        .format = *f
++    };
++
++    if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    switch (m) {
++    case MEDIABUFS_MEMORY_DMABUF:
++        // 0 = Unknown but assume not in that case
++        if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
++            return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++        break;
++    case MEDIABUFS_MEMORY_MMAP:
++        break;
++    default:
++        return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++    }
++
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus
++mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++    return chk_memory_type(mbc, &mbc->src_fmt, memtype);
++}
++
++MediaBufsStatus
++mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++    return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
++}
++
++/* src format must have been set up before this */
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
++                  struct
dmabufs_ctl * const dbsc,
++                  unsigned int n, const enum mediabufs_memory memtype)
++{
++    unsigned int i;
++    struct v4l2_requestbuffers req = {
++        .count = n,
++        .type = mbc->src_fmt.type,
++        .memory = mediabufs_memory_to_v4l2(memtype)
++    };
++
++    bq_free_all_free_src(mbc->src);
++
++    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
++        if (errno != EINTR) {
++            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++        }
++    }
++
++    if (n > req.count) {
++        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
++        n = req.count;
++    }
++
++    for (i = 0; i != n; ++i) {
++        struct qent_src *const be_src = qe_src_new(memtype);
++        if (!be_src) {
++            request_err(mbc->dc, "Failed to create src be %d\n", i);
++            goto fail;
++        }
++        switch (memtype) {
++        case MEDIABUFS_MEMORY_MMAP:
++            if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
++                qe_src_free(be_src);
++                goto fail;
++            }
++            be_src->fixed_size = 1;
++            break;
++        case MEDIABUFS_MEMORY_DMABUF:
++            if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
++                qe_src_free(be_src);
++                goto fail;
++            }
++            be_src->fixed_size = !mediabufs_src_resizable(mbc);
++            break;
++        default:
++            request_err(mbc->dc, "Unexpected memorty type\n");
++            qe_src_free(be_src); /* not on the free Q yet, so the fail: path would not release it */
++            goto fail;
++        }
++        be_src->base.index = i;
++
++        queue_put_free(mbc->src, &be_src->base);
++    }
++
++    mbc->src->memtype = memtype;
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++    bq_free_all_free_src(mbc->src);
++    req.count = 0;
++    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
++           errno == EINTR)
++        /* Loop */;
++
++    return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++/*
++ * Set stuff order:
++ *  Set src fmt
++ *  Set parameters (sps) on vfd
++ *  Negotiate dst format (dst_fmt_set)
++ *  Create src buffers
++ *  Alloc a dst buffer or Create dst slots
++*/
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
++{
++    if (mbc->stream_on)
++        return
MEDIABUFS_STATUS_SUCCESS; ++ ++ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { ++ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { ++ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); ++ set_stream(mbc->vfd, mbc->src_fmt.type, false); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ mbc->stream_on = true; ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) ++{ ++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; ++ ++ if (!mbc->stream_on) ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { ++ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); ++ status = MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { ++ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); ++ status = MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ mbc->stream_on = false; ++ return status; ++} ++ ++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) ++{ ++ struct v4l2_ext_controls controls = { ++ .controls = control_array, ++ .count = n ++ }; ++ ++ if (mreq) { ++ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL; ++ controls.request_fd = media_request_fd(mreq); ++ } ++ ++ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) ++ { ++ const int err = errno; ++ if (err != EINTR) { ++ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); ++ return -err; ++ } ++ } ++ ++ return 0; ++} ++ ++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, ++ struct media_request * const mreq, ++ unsigned int id, void *data, ++ unsigned int size) ++{ ++ struct v4l2_ext_control control = { ++ .id = id, ++ .ptr = 
data, ++ .size = size ++ }; ++ ++ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); ++ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; ++} ++ ++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, ++ enum v4l2_buf_type buf_type, ++ const uint32_t pixfmt, ++ const uint32_t width, const uint32_t height, ++ const size_t bufsize) ++{ ++ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); ++ if (rv != MEDIABUFS_STATUS_SUCCESS) ++ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); ++ ++ return rv; ++} ++ ++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) ++{ ++ int rv = 0; ++ while (n--) { ++ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { ++ const int err = errno; ++ if (err != EINTR) { ++ // Often used for probing - errors are to be expected ++ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); ++ ctrls->type = 0; // 0 is invalid ++ rv = -err; ++ break; ++ } ++ } ++ ++ctrls; ++ } ++ return rv; ++} ++ ++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) ++{ ++#if 1 ++ return 0; ++#else ++ // Single planar OUTPUT can only take exact size buffers ++ // Multiplanar will take larger than negotiated ++ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); ++#endif ++} ++ ++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) ++{ ++ if (!mbc) ++ return; ++ ++ // Break the weak link first ++ ff_weak_link_break(&mbc->this_wlm); ++ ++ polltask_delete(&mbc->pt); ++ ++ mediabufs_stream_off(mbc); ++ ++ // Empty v4l2 buffer stash ++ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); ++ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); ++ ++ bq_free_all_free_src(mbc->src); ++ bq_free_all_inuse_src(mbc->src); ++ bq_free_all_free_dst(mbc->dst); ++ ++ { ++ struct qent_dst 
*dst_be; ++ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { ++ dst_be->base.timestamp = (struct timeval){0}; ++ dst_be->base.status = QENT_ERROR; ++ qe_dst_done(dst_be); ++ } ++ } ++ ++ queue_delete(mbc->dst); ++ queue_delete(mbc->src); ++ close(mbc->vfd); ++ pthread_mutex_destroy(&mbc->lock); ++ ++ free(mbc); ++} ++ ++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) ++{ ++ atomic_fetch_add(&mbc->ref_count, 1); ++ return mbc; ++} ++ ++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) ++{ ++ struct mediabufs_ctl *const mbc = *pmbc; ++ int n; ++ ++ if (!mbc) ++ return; ++ *pmbc = NULL; ++ n = atomic_fetch_sub(&mbc->ref_count, 1); ++ if (n) ++ return; ++ mediabufs_ctl_delete(mbc); ++} ++ ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) ++{ ++ return mbc->capability.version; ++} ++ ++static int set_capabilities(struct mediabufs_ctl *const mbc) ++{ ++ uint32_t caps; ++ ++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { ++ int err = errno; ++ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); ++ return -err; ++ } ++ ++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
++ mbc->capability.device_caps : ++ mbc->capability.capabilities; ++ ++ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { ++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ } ++ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { ++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } ++ else { ++ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* One of these per context */ ++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) ++{ ++ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); ++ ++ if (!mbc) ++ return NULL; ++ ++ mbc->dc = dc; ++ // Default mono planar ++ mbc->pq = pq; ++ pthread_mutex_init(&mbc->lock, NULL); ++ ++ /* Pick a default - could we scan for this? */ ++ if (vpath == NULL) ++ vpath = "/dev/media0"; ++ ++ while ((mbc->vfd = open(vpath, O_RDWR)) == -1) ++ { ++ const int err = errno; ++ if (err != EINTR) { ++ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); ++ goto fail0; ++ } ++ } ++ ++ if (set_capabilities(mbc)) { ++ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); ++ goto fail1; ++ } ++ ++ mbc->src = queue_new(mbc->vfd); ++ if (!mbc->src) ++ goto fail1; ++ mbc->dst = queue_new(mbc->vfd); ++ if (!mbc->dst) ++ goto fail2; ++ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); ++ if (!mbc->pt) ++ goto fail3; ++ mbc->this_wlm = ff_weak_link_new(mbc); ++ if (!mbc->this_wlm) ++ goto fail4; ++ ++ /* Cannot add polltask now - polling with nothing pending ++ * generates infinite error polls ++ */ ++ return mbc; ++ ++fail4: ++ polltask_delete(&mbc->pt); ++fail3: ++ queue_delete(mbc->dst); ++fail2: ++ queue_delete(mbc->src); ++fail1: ++ close(mbc->vfd); ++fail0: ++ free(mbc); ++ request_info(dc, "%s: FAILED\n", __func__); ++ return NULL; ++} ++ ++ ++ +diff 
--git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h +new file mode 100644 +index 0000000000..890947b2e2 +--- /dev/null ++++ b/libavcodec/v4l2_req_media.h +@@ -0,0 +1,171 @@ ++/* ++e.h ++* ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */
++
++#ifndef _MEDIA_H_
++#define _MEDIA_H_
++
++#include <stdbool.h>
++#include <stdint.h>
++
++struct v4l2_format;
++struct v4l2_fmtdesc;
++struct v4l2_query_ext_ctrl;
++
++struct pollqueue;
++struct media_request;
++struct media_pool;
++
++typedef enum media_buf_status {
++    MEDIABUFS_STATUS_SUCCESS = 0,
++    MEDIABUFS_ERROR_OPERATION_FAILED,
++    MEDIABUFS_ERROR_DECODING_ERROR,
++    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
++    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
++    MEDIABUFS_ERROR_ALLOCATION_FAILED,
++    MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
++} MediaBufsStatus;
++
++struct media_pool * media_pool_new(const char * const media_path,
++                   struct pollqueue * const pq,
++                   const unsigned int n);
++void media_pool_delete(struct media_pool ** pmp);
++
++// Obtain a media request
++// Will block if none available - has a 2sec timeout
++struct media_request * media_request_get(struct media_pool * const mp);
++int media_request_fd(const struct media_request * const req);
++
++// Start this request
++// Request structure is returned to pool once done
++int media_request_start(struct media_request * const req);
++
++// Return an *unstarted* media_request to the pool
++// May later be upgraded to allow for aborting a started req
++int media_request_abort(struct media_request ** const preq);
++
++
++struct mediabufs_ctl;
++struct qent_src;
++struct qent_dst;
++struct dmabuf_h;
++struct dmabufs_ctl;
++
++// 1-1 mapping to V4L2 type - just defined separately to avoid some include versioning difficulties
++enum mediabufs_memory {
++    MEDIABUFS_MEMORY_UNSET = 0,
++    MEDIABUFS_MEMORY_MMAP = 1,
++    MEDIABUFS_MEMORY_USERPTR = 2,
++    MEDIABUFS_MEMORY_OVERLAY = 3,
++    MEDIABUFS_MEMORY_DMABUF = 4,
++};
++
++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
++
++// prealloc
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
++// dbsc may be NULL if realloc not
required ++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc); ++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane); ++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane); ++MediaBufsStatus qent_dst_wait(struct qent_dst *const be); ++void qent_dst_delete(struct qent_dst *const be); ++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead ++void qent_dst_unref(struct qent_dst ** const pbe_dst); ++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst); ++ ++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no); ++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be); ++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be); ++/* Import an fd unattached to any mediabuf */ ++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, ++ unsigned int plane, ++ int fd, size_t size); ++ ++const char * mediabufs_memory_name(const enum mediabufs_memory m); ++ ++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, ++ struct media_request **const pmreq, ++ struct qent_src **const psrc_be, ++ struct qent_dst *const dst_be, ++ const bool is_final); ++// Get / alloc a dst buffer & associate with a slot ++// If the dst pool is empty then behaviour depends on the fixed flag passed to ++// dst_slots_create. 
Default is !fixed = unlimited alloc ++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, ++ struct dmabufs_ctl *const dbsc); ++// Create dst slots without alloc ++// If fixed true then qent_alloc will only get slots from this pool and will ++// block until a qent has been unrefed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype); ++ ++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); ++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); ++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); ++ ++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); ++ ++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, ++ const unsigned int width, ++ const unsigned int height, ++ mediabufs_dst_fmt_accept_fn *const accept_fn, ++ void *const accept_v); ++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); ++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); ++ ++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, ++ struct v4l2_ext_control control_array[], unsigned int n); ++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, ++ struct media_request * const mreq, ++ unsigned int id, void *data, ++ unsigned int size); ++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); ++ ++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); ++ ++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, ++ enum v4l2_buf_type buf_type, ++ const uint32_t pixfmt, ++ const uint32_t width, const uint32_t height, ++ const size_t bufsize); ++ ++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, ++ struct dmabufs_ctl * 
const dbsc,
++                  unsigned int n,
++                  const enum mediabufs_memory memtype);
++
++// Want to have appropriate formats set first
++MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
++                     const char *vpath, struct pollqueue *const pq);
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
++
++
++#endif
+diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
+new file mode 100644
+index 0000000000..cc8a5d4001
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.c
+@@ -0,0 +1,361 @@
++#include <errno.h>
++#include <limits.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdint.h>
++#include <stdlib.h>
++#include <string.h>
++#include <time.h>
++#include <unistd.h>
++#include <sys/eventfd.h>
++
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++
++
++struct pollqueue;
++
++enum polltask_state {
++    POLLTASK_UNQUEUED = 0,
++    POLLTASK_QUEUED,
++    POLLTASK_RUNNING,
++    POLLTASK_Q_KILL,
++    POLLTASK_RUN_KILL,
++};
++
++struct polltask {
++    struct polltask *next;
++    struct polltask *prev;
++    struct pollqueue *q;
++    enum polltask_state state;
++
++    int fd;
++    short events;
++
++    void (*fn)(void *v, short revents);
++    void * v;
++
++    uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
++    sem_t kill_sem;
++};
++
++struct pollqueue {
++    atomic_int ref_count;
++    pthread_mutex_t lock;
++
++    struct polltask *head;
++    struct polltask *tail;
++
++    bool kill;
++    bool no_prod;
++    int prod_fd;
++    struct polltask *prod_pt;
++    pthread_t worker;
++};
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++                  const int fd, const short events,
++ void (*const fn)(void *v, short revents), ++ void *const v) ++{ ++ struct polltask *pt; ++ ++ if (!events) ++ return NULL; ++ ++ pt = malloc(sizeof(*pt)); ++ if (!pt) ++ return NULL; ++ ++ *pt = (struct polltask){ ++ .next = NULL, ++ .prev = NULL, ++ .q = pollqueue_ref(pq), ++ .fd = fd, ++ .events = events, ++ .fn = fn, ++ .v = v ++ }; ++ ++ sem_init(&pt->kill_sem, 0, 0); ++ ++ return pt; ++} ++ ++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) ++{ ++ if (pt->prev) ++ pt->prev->next = pt->next; ++ else ++ pq->head = pt->next; ++ if (pt->next) ++ pt->next->prev = pt->prev; ++ else ++ pq->tail = pt->prev; ++ pt->next = NULL; ++ pt->prev = NULL; ++} ++ ++static void polltask_free(struct polltask * const pt) ++{ ++ sem_destroy(&pt->kill_sem); ++ free(pt); ++} ++ ++static int pollqueue_prod(const struct pollqueue *const pq) ++{ ++ static const uint64_t one = 1; ++ return write(pq->prod_fd, &one, sizeof(one)); ++} ++ ++void polltask_delete(struct polltask **const ppt) ++{ ++ struct polltask *const pt = *ppt; ++ struct pollqueue * pq; ++ enum polltask_state state; ++ bool prodme; ++ ++ if (!pt) ++ return; ++ ++ pq = pt->q; ++ pthread_mutex_lock(&pq->lock); ++ state = pt->state; ++ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL; ++ prodme = !pq->no_prod; ++ pthread_mutex_unlock(&pq->lock); ++ ++ if (state != POLLTASK_UNQUEUED) { ++ if (prodme) ++ pollqueue_prod(pq); ++ while (sem_wait(&pt->kill_sem) && errno == EINTR) ++ /* loop */; ++ } ++ ++ // Leave zapping the ref until we have DQed the PT as might well be ++ // legitimately used in it ++ *ppt = NULL; ++ polltask_free(pt); ++ pollqueue_unref(&pq); ++} ++ ++static uint64_t pollqueue_now(int timeout) ++{ ++ struct timespec now; ++ uint64_t now_ms; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &now)) ++ return 0; ++ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; ++ return now_ms ? 
now_ms : (uint64_t)1; ++} ++ ++void pollqueue_add_task(struct polltask *const pt, const int timeout) ++{ ++ bool prodme = false; ++ struct pollqueue * const pq = pt->q; ++ ++ pthread_mutex_lock(&pq->lock); ++ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { ++ if (pq->tail) ++ pq->tail->next = pt; ++ else ++ pq->head = pt; ++ pt->prev = pq->tail; ++ pt->next = NULL; ++ pt->state = POLLTASK_QUEUED; ++ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout); ++ pq->tail = pt; ++ prodme = !pq->no_prod; ++ } ++ pthread_mutex_unlock(&pq->lock); ++ if (prodme) ++ pollqueue_prod(pq); ++} ++ ++static void *poll_thread(void *v) ++{ ++ struct pollqueue *const pq = v; ++ struct pollfd *a = NULL; ++ size_t asize = 0; ++ ++ pthread_mutex_lock(&pq->lock); ++ do { ++ unsigned int i; ++ unsigned int n = 0; ++ struct polltask *pt; ++ struct polltask *pt_next; ++ uint64_t now = pollqueue_now(0); ++ int timeout = -1; ++ int rv; ++ ++ for (pt = pq->head; pt; pt = pt_next) { ++ int64_t t; ++ ++ pt_next = pt->next; ++ ++ if (pt->state == POLLTASK_Q_KILL) { ++ pollqueue_rem_task(pq, pt); ++ sem_post(&pt->kill_sem); ++ continue; ++ } ++ ++ if (n >= asize) { ++ asize = asize ? asize * 2 : 4; ++ a = realloc(a, asize * sizeof(*a)); ++ if (!a) { ++ request_log("Failed to realloc poll array to %zd\n", asize); ++ goto fail_locked; ++ } ++ } ++ ++ a[n++] = (struct pollfd){ ++ .fd = pt->fd, ++ .events = pt->events ++ }; ++ ++ t = (int64_t)(pt->timeout - now); ++ if (pt->timeout && t < INT_MAX && ++ (timeout < 0 || (int)t < timeout)) ++ timeout = (t < 0) ? 
0 : (int)t;
++        }
++        pthread_mutex_unlock(&pq->lock);
++
++        if ((rv = poll(a, n, timeout)) == -1) {
++            if (errno != EINTR) {
++                request_log("Poll error: %s\n", strerror(errno));
++                goto fail_unlocked;
++            }
++        }
++
++        pthread_mutex_lock(&pq->lock);
++        now = pollqueue_now(0);
++
++        /* Prodding in this loop is pointless and might lead to
++         * infinite looping
++        */
++        pq->no_prod = true;
++        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
++            pt_next = pt->next;
++
++            /* Pending? */
++            if (a[i].revents ||
++                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
++                pollqueue_rem_task(pq, pt);
++                if (pt->state == POLLTASK_QUEUED)
++                    pt->state = POLLTASK_RUNNING;
++                if (pt->state == POLLTASK_Q_KILL)
++                    pt->state = POLLTASK_RUN_KILL;
++                pthread_mutex_unlock(&pq->lock);
++
++                /* This can add new entries to the Q but as
++                 * those are added to the tail our existing
++                 * chain remains intact
++                 */
++                pt->fn(pt->v, a[i].revents);
++
++                pthread_mutex_lock(&pq->lock);
++                if (pt->state == POLLTASK_RUNNING)
++                    pt->state = POLLTASK_UNQUEUED;
++                if (pt->state == POLLTASK_RUN_KILL)
++                    sem_post(&pt->kill_sem);
++            }
++        }
++        pq->no_prod = false;
++
++    } while (!pq->kill);
++
++fail_locked:
++    pthread_mutex_unlock(&pq->lock);
++fail_unlocked:
++    free(a);
++    return NULL;
++}
++
++static void prod_fn(void *v, short revents)
++{
++    struct pollqueue *const pq = v;
++    char buf[8];
++    if (revents)
++        read(pq->prod_fd, buf, 8);
++    if (!pq->kill)
++        pollqueue_add_task(pq->prod_pt, -1);
++}
++
++struct pollqueue * pollqueue_new(void)
++{
++    struct pollqueue *pq = malloc(sizeof(*pq));
++    if (!pq)
++        return NULL;
++    *pq = (struct pollqueue){
++        .ref_count = ATOMIC_VAR_INIT(0),
++        .lock = PTHREAD_MUTEX_INITIALIZER,
++        .head = NULL,
++        .tail = NULL,
++        .kill = false,
++        .prod_fd = -1
++    };
++
++    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
++    if (pq->prod_fd == -1) /* eventfd() reports failure as -1; the old test against 1 never caught it */
++        goto fail1;
++    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
++    if (!pq->prod_pt)
++        goto fail2;
++
pollqueue_add_task(pq->prod_pt, -1); ++ if (pthread_create(&pq->worker, NULL, poll_thread, pq)) ++ goto fail3; ++ // Reset ref count which will have been inced by the add_task ++ atomic_store(&pq->ref_count, 0); ++ return pq; ++ ++fail3: ++ polltask_free(pq->prod_pt); ++fail2: ++ close(pq->prod_fd); ++fail1: ++ free(pq); ++ return NULL; ++} ++ ++static void pollqueue_free(struct pollqueue *const pq) ++{ ++ void *rv; ++ ++ pthread_mutex_lock(&pq->lock); ++ pq->kill = true; ++ pollqueue_prod(pq); ++ pthread_mutex_unlock(&pq->lock); ++ ++ pthread_join(pq->worker, &rv); ++ polltask_free(pq->prod_pt); ++ pthread_mutex_destroy(&pq->lock); ++ close(pq->prod_fd); ++ free(pq); ++} ++ ++struct pollqueue * pollqueue_ref(struct pollqueue *const pq) ++{ ++ atomic_fetch_add(&pq->ref_count, 1); ++ return pq; ++} ++ ++void pollqueue_unref(struct pollqueue **const ppq) ++{ ++ struct pollqueue * const pq = *ppq; ++ ++ if (!pq) ++ return; ++ *ppq = NULL; ++ ++ if (atomic_fetch_sub(&pq->ref_count, 1) != 0) ++ return; ++ ++ pollqueue_free(pq); ++} ++ ++ ++ +diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h +new file mode 100644 +index 0000000000..e1182cb2fc +--- /dev/null ++++ b/libavcodec/v4l2_req_pollqueue.h +@@ -0,0 +1,18 @@ ++#ifndef POLLQUEUE_H_ ++#define POLLQUEUE_H_ ++ ++struct polltask; ++struct pollqueue; ++ ++struct polltask *polltask_new(struct pollqueue *const pq, ++ const int fd, const short events, ++ void (*const fn)(void *v, short revents), ++ void *const v); ++void polltask_delete(struct polltask **const ppt); ++ ++void pollqueue_add_task(struct polltask *const pt, const int timeout); ++struct pollqueue * pollqueue_new(void); ++void pollqueue_unref(struct pollqueue **const ppq); ++struct pollqueue * pollqueue_ref(struct pollqueue *const pq); ++ ++#endif /* POLLQUEUE_H_ */ +diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h +new file mode 100644 +index 0000000000..a31cc1f4ec +--- /dev/null ++++ 
b/libavcodec/v4l2_req_utils.h
+@@ -0,0 +1,27 @@
++#ifndef AVCODEC_V4L2_REQ_UTILS_H
++#define AVCODEC_V4L2_REQ_UTILS_H
++
++#include <stdint.h>
++#include "libavutil/log.h"
++
++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
++
++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
++#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
++
++static inline char safechar(char c) {
++    return c > 0x20 && c < 0x7f ? c : '.';
++}
++
++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
++    tbuf[0] = safechar((fcc >> 0) & 0xff);
++    tbuf[1] = safechar((fcc >> 8) & 0xff);
++    tbuf[2] = safechar((fcc >> 16) & 0xff);
++    tbuf[3] = safechar((fcc >> 24) & 0xff);
++    tbuf[4] = '\0';
++    return tbuf;
++}
++
++#endif
+diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
+new file mode 100644
+index 0000000000..fbec16a93e
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,347 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "config.h" ++#include "decode.h" ++#include "hevcdec.h" ++#include "hwconfig.h" ++ ++#include "v4l2_request_hevc.h" ++ ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/pixdesc.h" ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_media.h" ++#include "v4l2_req_utils.h" ++ ++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) ++{ ++ const size_t wxh = w * h; ++ size_t bits_alloc; ++ ++ /* Annex A gives a min compression of 2 @ lvl 3.1 ++ * (wxh <= 983040) and min 4 thereafter but avoid ++ * the odity of 983041 having a lower limit than ++ * 983040. ++ * Multiply by 3/2 for 4:2:0 ++ */ ++ bits_alloc = wxh < 983040 ? wxh * 3 / 4 : ++ wxh < 983040 * 2 ? 
983040 * 3 / 4 : ++ wxh * 3 / 8; ++ /* Allow for bit depth */ ++ bits_alloc += (bits_alloc * bits_minus8) / 8; ++ /* Add a few bytes (16k) for overhead */ ++ bits_alloc += 0x4000; ++ return bits_alloc; ++} ++ ++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->start_frame(avctx, buffer, size); ++} ++ ++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->decode_slice(avctx, buffer, size); ++} ++ ++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->end_frame(avctx); ++} ++ ++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ctx->fns->abort_frame(avctx); ++} ++ ++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->frame_params(avctx, hw_frames_ctx); ++} ++ ++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->alloc_frame(avctx, frame); ++} ++ ++ ++static int v4l2_request_hevc_uninit(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode ++ ++ mediabufs_ctl_unref(&ctx->mbufs); ++ media_pool_delete(&ctx->mpool); ++ pollqueue_unref(&ctx->pq); ++ dmabufs_ctl_unref(&ctx->dbufs); ++ 
devscan_delete(&ctx->devscan); ++ ++ decode_q_uninit(&ctx->decode_q); ++ ++// if (avctx->hw_frames_ctx) { ++// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; ++// av_buffer_pool_flush(hwfc->pool); ++// } ++ return 0; ++} ++ ++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) ++{ ++ AVCodecContext *const avctx = v; ++ const HEVCContext *const h = avctx->priv_data; ++ ++ if (h->ps.sps->bit_depth == 8) { ++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || ++ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { ++ return 1; ++ } ++ } ++ else if (h->ps.sps->bit_depth == 10) { ++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++static int v4l2_request_hevc_init(AVCodecContext *avctx) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ const HEVCSPS * const sps = h->ps.sps; ++ int ret; ++ const struct decdev * decdev; ++ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes ++ size_t src_size; ++ enum mediabufs_memory src_memtype; ++ enum mediabufs_memory dst_memtype; ++ ++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ // Give up immediately if this is something that we have no code to deal with ++ if (h->ps.sps->chroma_format_idc != 1) { ++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); ++ return AVERROR_PATCHWELCOME; ++ } ++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || ++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { ++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); ++ return AVERROR_PATCHWELCOME; ++ } ++ ++ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); ++ 
return (AVERROR(-ret)); ++ } ++ ret = AVERROR(ENOMEM); // Assume mem fail by default for these ++ ++ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) ++ { ++ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); ++ ret = AVERROR(ENODEV); ++ goto fail0; ++ } ++ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n", ++ decdev_media_path(decdev), decdev_video_path(decdev)); ++ ++ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { ++ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n"); ++ src_memtype = MEDIABUFS_MEMORY_MMAP; ++ dst_memtype = MEDIABUFS_MEMORY_MMAP; ++ } ++ else { ++ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n"); ++ src_memtype = MEDIABUFS_MEMORY_DMABUF; ++ dst_memtype = MEDIABUFS_MEMORY_DMABUF; ++ } ++ ++ if ((ctx->pq = pollqueue_new()) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); ++ goto fail1; ++ } ++ ++ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); ++ goto fail2; ++ } ++ ++ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); ++ goto fail3; ++ } ++ ++ // Ask for an initial bitbuf size of max size / 4 ++ // We will realloc if we need more ++ // Must use sps->h/w as avctx contains cropped size ++retry_src_memtype: ++ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); ++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs)) ++ src_size /= 4; ++ // Kludge for conformance tests which break Annex A limits ++ else if (src_size < 0x40000) ++ src_size = 0x40000; ++ ++ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, ++ sps->width, sps->height, src_size)) { ++ char tbuf1[5]; ++ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, 
src_pix_fmt), sps->width, sps->height); ++ goto fail4; ++ } ++ ++ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) { ++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) { ++ src_memtype = MEDIABUFS_MEMORY_MMAP; ++ goto retry_src_memtype; ++ } ++ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n"); ++ goto fail4; ++ } ++ ++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 4); ++ } ++#if CONFIG_V4L2_REQ_HEVC_VX ++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 3); ++ } ++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 2); ++ } ++ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 1); ++ } ++#endif ++ else { ++ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); ++ ret = AVERROR(EINVAL); ++ goto fail4; ++ } ++ ++ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { ++ char tbuf1[5]; ++ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); ++ goto fail4; ++ } ++ ++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); ++ goto fail4; ++ } ++ ++ { ++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + ++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); ++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, ++ avctx->thread_count, avctx->extra_hw_frames); ++ ++ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) { ++ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n"); ++ goto fail4; ++ } ++ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n"); ++ dst_memtype = MEDIABUFS_MEMORY_MMAP; ++ } ++ ++ // extra_hw_frames is -1 if unset ++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); ++ goto fail4; ++ } ++ } ++ ++ if (mediabufs_stream_on(ctx->mbufs)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); ++ goto fail4; ++ } ++ ++ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); ++ goto fail4; ++ } ++ ++ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); ++ goto fail5; ++ } ++ ++ decode_q_init(&ctx->decode_q); ++ ++ // Set our s/w format ++ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; ++ ++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n", ++ ctx->fns->name, ++ decdev_media_path(decdev), decdev_video_path(decdev), ++ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype), ++ av_get_pix_fmt_name(avctx->sw_pix_fmt)); ++ ++ return 0; ++ ++fail5: ++ av_buffer_unref(&avctx->hw_frames_ctx); ++fail4: ++ mediabufs_ctl_unref(&ctx->mbufs); ++fail3: ++ media_pool_delete(&ctx->mpool); ++fail2: ++ pollqueue_unref(&ctx->pq); ++fail1: ++ dmabufs_ctl_unref(&ctx->dbufs); ++fail0: ++ devscan_delete(&ctx->devscan); ++ return ret; ++} ++ ++const AVHWAccel 
ff_hevc_v4l2request_hwaccel = { ++ .name = "hevc_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .alloc_frame = v4l2_req_hevc_alloc_frame, ++ .start_frame = v4l2_req_hevc_start_frame, ++ .decode_slice = v4l2_req_hevc_decode_slice, ++ .end_frame = v4l2_req_hevc_end_frame, ++ .abort_frame = v4l2_req_hevc_abort_frame, ++ .init = v4l2_request_hevc_init, ++ .uninit = v4l2_request_hevc_uninit, ++ .priv_data_size = sizeof(V4L2RequestContextHEVC), ++ .frame_params = v4l2_req_hevc_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +new file mode 100644 +index 0000000000..99c90064ea +--- /dev/null ++++ b/libavcodec/v4l2_request_hevc.h +@@ -0,0 +1,102 @@ ++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H ++#define AVCODEC_V4L2_REQUEST_HEVC_H ++ ++#include ++#include ++#include "v4l2_req_decode_q.h" ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#include ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in drm_fourcc.h hopefully will be sometime in the future but until then... 
++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ ++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY ++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 ++#endif ++ ++#define VCAT(name, version) name##_v##version ++#define V2(n,v) VCAT(n, v) ++#define V(n) V2(n, HEVC_CTRLS_VERSION) ++ ++#define S2(x) #x ++#define STR(x) S2(x) ++ ++// 1 per decoder ++struct v4l2_req_decode_fns; ++ ++typedef struct V4L2RequestContextHEVC { ++// V4L2RequestContext base; ++ const struct v4l2_req_decode_fns * fns; ++ ++ unsigned int timestamp; // ?? maybe uint64_t ++ ++ int decode_mode; ++ int start_code; ++ unsigned int max_slices; // 0 => not wanted (frame mode) ++ unsigned int max_offsets; // 0 => not wanted ++ ++ req_decode_q decode_q; ++ ++ struct devscan *devscan; ++ struct dmabufs_ctl *dbufs; ++ struct pollqueue *pq; ++ struct media_pool * mpool; ++ struct mediabufs_ctl *mbufs; ++} V4L2RequestContextHEVC; ++ ++typedef struct v4l2_req_decode_fns { ++ int src_pix_fmt_v4l2; ++ const char * name; ++ ++ // Init setup ++ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); ++ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); ++ ++ // Passthrough of hwaccel fns ++ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); ++ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); ++ int (*end_frame)(AVCodecContext *avctx); ++ void (*abort_frame)(AVCodecContext *avctx); ++ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); ++} v4l2_req_decode_fns; ++ ++ ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); ++extern const 
v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); ++ ++#endif +diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c +index d4ceb60791..fb7f839c5e 100644 +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) + size = next - start - 4; + if (size <= 0) + continue; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); ++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&gb, buf2, buf2_size * 8); + switch (AV_RB32(start)) { + case VC1_CODE_SEQHDR: +@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + case VC1_CODE_FRAME: + if (avctx->hwaccel) + buf_start = start; +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + break; + case VC1_CODE_FIELD: { + int buf_size3; +@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; +@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + break; + } + case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&s->gb, buf2, buf_size2 * 8); + ff_vc1_decode_entry_point(avctx, v, &s->gb); + break; +@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = 
v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); +@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = s->mb_height + 1 >> 1; +@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + n_slices1 = n_slices - 1; + n_slices++; + } +- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); + } else { +- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); + } + init_get_bits(&s->gb, buf2, buf_size2*8); + } else +diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c +index c25a6f3adf..10182786b3 100644 +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -32,6 +32,7 @@ + #include "rnd_avg.h" + #include "vc1dsp.h" + #include "startcode.h" ++#include "vc1_common.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + + dsp->startcode_find_candidate = ff_startcode_find_candidate_c; ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer; + + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); +diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h +index 75db62b1b4..e192b431be 100644 +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -80,6 +80,9 @@ typedef 
struct VC1DSPContext { + * one or more further zero bytes and a one byte. + */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); ++ ++ /* Copy a buffer, removing startcode emulation escape bytes as we go */ ++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); +diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c +new file mode 100644 +index 0000000000..5a79e89ed7 +--- /dev/null ++++ b/libavcodec/weak_link.c +@@ -0,0 +1,103 @@ ++#include ++#include ++#include ++#include "weak_link.h" ++ ++struct ff_weak_link_master { ++ atomic_int ref_count; /* 0 is single ref for easier atomics */ ++ pthread_rwlock_t lock; ++ void * ptr; ++}; ++ ++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) ++{ ++ return (struct ff_weak_link_master *)c; ++} ++ ++struct ff_weak_link_master * ff_weak_link_new(void * p) ++{ ++ struct ff_weak_link_master * w = malloc(sizeof(*w)); ++ if (!w) ++ return NULL; ++ atomic_init(&w->ref_count, 0); ++ w->ptr = p; ++ if (pthread_rwlock_init(&w->lock, NULL)) { ++ free(w); ++ return NULL; ++ } ++ return w; ++} ++ ++static void weak_link_do_unref(struct ff_weak_link_master * const w) ++{ ++ int n = atomic_fetch_sub(&w->ref_count, 1); ++ if (n) ++ return; ++ ++ pthread_rwlock_destroy(&w->lock); ++ free(w); ++} ++ ++// Unref & break link ++void ff_weak_link_break(struct ff_weak_link_master ** ppLink) ++{ ++ struct ff_weak_link_master * const w = *ppLink; ++ if (!w) ++ return; ++ ++ *ppLink = NULL; ++ pthread_rwlock_wrlock(&w->lock); ++ w->ptr = NULL; ++ pthread_rwlock_unlock(&w->lock); ++ ++ weak_link_do_unref(w); ++} ++ ++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) ++{ ++ if (!w) ++ return NULL; ++ atomic_fetch_add(&w->ref_count, 1); ++ return (struct ff_weak_link_client*)w; ++} ++ ++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) ++{ ++ struct ff_weak_link_master * const 
w = weak_link_x(*ppLink); ++ if (!w) ++ return; ++ ++ *ppLink = NULL; ++ weak_link_do_unref(w); ++} ++ ++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) ++{ ++ struct ff_weak_link_master * const w = weak_link_x(*ppLink); ++ ++ if (!w) ++ return NULL; ++ ++ if (pthread_rwlock_rdlock(&w->lock)) ++ goto broken; ++ ++ if (w->ptr) ++ return w->ptr; ++ ++ pthread_rwlock_unlock(&w->lock); ++ ++broken: ++ *ppLink = NULL; ++ weak_link_do_unref(w); ++ return NULL; ++} ++ ++// Ignores a NULL c (so can be on the return path of both broken & live links) ++void ff_weak_link_unlock(struct ff_weak_link_client * c) ++{ ++ struct ff_weak_link_master * const w = weak_link_x(c); ++ if (w) ++ pthread_rwlock_unlock(&w->lock); ++} ++ ++ +diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h +new file mode 100644 +index 0000000000..415b6a27a0 +--- /dev/null ++++ b/libavcodec/weak_link.h +@@ -0,0 +1,23 @@ ++struct ff_weak_link_master; ++struct ff_weak_link_client; ++ ++struct ff_weak_link_master * ff_weak_link_new(void * p); ++void ff_weak_link_break(struct ff_weak_link_master ** ppLink); ++ ++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w); ++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink); ++ ++// Returns NULL if link broken - in this case it will also zap ++// *ppLink and unref the weak_link. 
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken) ++// ++// The above does mean that there is a race if this is called simultainiously ++// by two threads using the same weak_link_client (so don't do that) ++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); ++void ff_weak_link_unlock(struct ff_weak_link_client * c); ++ ++ ++ ++ ++ ++ +diff --git a/libavdevice/Makefile b/libavdevice/Makefile +index 0dfe47a1f4..ec7c7b4147 100644 +--- a/libavdevice/Makefile ++++ b/libavdevice/Makefile +@@ -47,6 +47,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o + OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o + OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o + OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o ++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o ++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o ++OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o + OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o + OBJS-$(CONFIG_XV_OUTDEV) += xv.o + +diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c +index 92b27a1d14..19d2a9de55 100644 +--- a/libavdevice/alldevices.c ++++ b/libavdevice/alldevices.c +@@ -53,6 +53,9 @@ extern AVOutputFormat ff_sndio_muxer; + extern AVInputFormat ff_v4l2_demuxer; + extern AVOutputFormat ff_v4l2_muxer; + extern AVInputFormat ff_vfwcap_demuxer; ++extern AVOutputFormat ff_vout_drm_muxer; ++extern AVOutputFormat ff_vout_egl_muxer; ++extern AVOutputFormat ff_vout_rpi_muxer; + extern AVInputFormat ff_xcbgrab_demuxer; + extern AVOutputFormat ff_xv_muxer; + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +new file mode 100644 +index 0000000000..c7b90e6dd8 +--- /dev/null ++++ b/libavdevice/drm_vout.c +@@ -0,0 +1,680 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++ ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define TRACE_ALL 0 ++ ++#define DRM_MODULE "vc4" ++ ++#define ERRSTR strerror(errno) ++ ++struct drm_setup { ++ int conId; ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ unsigned int out_fourcc; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct drm_aux_s { ++ unsigned int fb_handle; ++ uint32_t bo_handles[AV_DRM_MAX_PLANES]; ++ AVFrame * frame; ++} drm_aux_t; ++ ++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS ++// we get initial flicker probably due to dodgy drm timing ++#define AUX_SIZE 3 ++typedef struct drm_display_env_s ++{ ++ AVClass *class; ++ ++ int drm_fd; ++ uint32_t con_id; ++ struct drm_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ int show_all; ++ const char * drm_module; ++ ++ unsigned int ano; ++ drm_aux_t aux[AUX_SIZE]; ++ ++ pthread_t q_thread; ++ sem_t q_sem_in; ++ sem_t q_sem_out; 
++ int q_terminate; ++ AVFrame * q_next; ++ ++} drm_display_env_t; ++ ++ ++static int drm_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int drm_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++static int find_plane(struct AVFormatContext * const avctx, ++ const int drmfd, const int crtcidx, const uint32_t format, ++ uint32_t * const pplane_id) ++{ ++ drmModePlaneResPtr planes; ++ drmModePlanePtr plane; ++ drmModeObjectPropertiesPtr props = NULL; ++ drmModePropertyPtr prop = NULL; ++ unsigned int i; ++ unsigned int j; ++ int ret = -1; ++ ++ planes = drmModeGetPlaneResources(drmfd); ++ if (!planes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ for (i = 0; i < planes->count_planes; ++i) { ++ plane = drmModeGetPlane(drmfd, planes->planes[i]); ++ if (!plane) /* test the plane just fetched, not the resource list */ ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); ++ goto fail; /* no plane id was stored: free resources and return ret (-1) */ ++ } ++ ++ if (!(plane->possible_crtcs & (1 << crtcidx))) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ for (j = 0; j < plane->count_formats; ++j) { ++ if (plane->formats[j] == format) ++ break; ++ } ++ ++ if (j == plane->count_formats) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ *pplane_id = plane->plane_id; ++ drmModeFreePlane(plane); ++ break; ++ } ++ ++ if (i == planes->count_planes) { ++ ret = -1; ++ goto fail; ++ } ++ ++ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE); ++ if (!props) ++ goto fail; ++ for (i = 0; i != 
props->count_props; ++i) { ++ if (prop) ++ drmModeFreeProperty(prop); ++ prop = drmModeGetProperty(drmfd, props->props[i]); ++ if (!prop) ++ goto fail; ++ if (strcmp("zpos", prop->name) == 0) { ++ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0) ++ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n"); ++ break; ++ } ++ } ++ ++ ret = 0; ++fail: ++ if (props) ++ drmModeFreeObjectProperties(props); ++ if (prop) ++ drmModeFreeProperty(prop); ++ drmModeFreePlaneResources(planes); ++ return ret; ++} ++ ++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) ++{ ++ if (da->fb_handle != 0) { ++ drmModeRmFB(de->drm_fd, da->fb_handle); ++ da->fb_handle = 0; ++ } ++ ++ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) { ++ if (da->bo_handles[i]) { ++ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]}; ++ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ da->bo_handles[i] = 0; ++ } ++ } ++ av_frame_free(&da->frame); ++} ++ ++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame) ++{ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ drm_aux_t * da = de->aux + de->ano; ++ const uint32_t format = desc->layers[0].format; ++ int ret = 0; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd); ++#endif ++ ++ if (de->setup.out_fourcc != format) { ++ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) { ++ av_frame_free(&frame); ++ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format); ++ return -1; ++ } ++ de->setup.out_fourcc = format; ++ } ++ ++ { ++ drmVBlank vbl = { ++ .request = { ++ .type = DRM_VBLANK_RELATIVE, ++ .sequence = 0 ++ } ++ }; ++ ++ while (drmWaitVBlank(de->drm_fd, &vbl)) { ++ if (errno != EINTR) { ++// av_log(s, AV_LOG_WARNING, 
"drmWaitVBlank failed: %s\n", ERRSTR); ++ break; ++ } ++ } ++ } ++ ++ da_uninit(de, da); ++ ++ { ++ uint32_t pitches[4] = {0}; ++ uint32_t offsets[4] = {0}; ++ uint64_t modifiers[4] = {0}; ++ uint32_t bo_handles[4] = {0}; ++ int has_mods = 0; ++ int i, j, n; ++ ++ da->frame = frame; ++ ++ for (i = 0; i < desc->nb_objects; ++i) { ++ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); ++ return -1; ++ } ++ if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR && ++ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID) ++ has_mods = 1; ++ } ++ ++ n = 0; ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ pitches[n] = p->pitch; ++ offsets[n] = p->offset; ++ modifiers[n] = obj->format_modifier; ++ bo_handles[n] = da->bo_handles[p->object_index]; ++ ++n; ++ } ++ } ++ ++#if 1 && TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_handles[0], ++ bo_handles[1], ++ bo_handles[2], ++ bo_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long long)modifiers[2], ++ (long long)modifiers[3] ++ ); ++#endif ++ ++ if (drmModeAddFB2WithModifiers(de->drm_fd, ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, bo_handles, ++ pitches, offsets, ++ has_mods ? modifiers : NULL, ++ &da->fb_handle, ++ has_mods ? 
DRM_MODE_FB_MODIFIERS : 0) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); ++ return -1; ++ } ++ } ++ ++ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, ++ da->fb_handle, 0, ++ de->setup.compose.x, de->setup.compose.y, ++ de->setup.compose.width, ++ de->setup.compose.height, ++ 0, 0, ++ av_frame_cropped_width(frame) << 16, ++ av_frame_cropped_height(frame) << 16); ++ ++ if (ret != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); ++ } ++ ++ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1; ++ ++ return ret; ++} ++ ++static int do_sem_wait(sem_t * const sem, const int nowait) ++{ ++ while (nowait ? sem_trywait(sem) : sem_wait(sem)) { ++ if (errno != EINTR) ++ return -errno; ++ } ++ return 0; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ drm_display_env_t * const de = s->priv_data; ++ int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++#endif ++ ++ sem_post(&de->q_sem_out); ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ do_sem_wait(&de->q_sem_in, 0); ++ ++ if (de->q_terminate) ++ break; ++ ++ frame = de->q_next; ++ de->q_next = NULL; ++ sem_post(&de->q_sem_out); ++ ++ do_display(s, de, frame); ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++#endif ++ ++ for (i = 0; i != AUX_SIZE; ++i) ++ da_uninit(de, de->aux + i); ++ ++ av_frame_free(&de->q_next); ++ ++ return NULL; ++} ++ ++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ drm_display_env_t * const de = s->priv_data; ++ int ret; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ ++ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { ++ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts); ++ return 0; ++ } ++ ++ if (src_frame->format == 
AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = do_sem_wait(&de->q_sem_out, !de->show_all); ++ if (ret) { ++ av_frame_free(&frame); ++ } ++ else { ++ de->q_next = frame; ++ sem_post(&de->q_sem_in); ++ } ++ ++ return 0; ++} ++ ++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* drm_vout_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++ ++ return 0; ++} ++ ++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) ++{ ++ int ret = -1; ++ int i; ++ drmModeRes *res = drmModeGetResources(drmfd); ++ drmModeConnector *c; ++ ++ if(!res) ++ { ++ printf( "drmModeGetResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ if (res->count_crtcs <= 0) ++ { ++ printf( "drm: no crts\n"); ++ goto fail_res; ++ } ++ ++ if (!s->conId) { ++ fprintf(stderr, ++ "No connector ID specified. 
Choosing default from list:\n"); ++ ++ for (i = 0; i < res->count_connectors; i++) { ++ drmModeConnector *con = ++ drmModeGetConnector(drmfd, res->connectors[i]); ++ drmModeEncoder *enc = NULL; ++ drmModeCrtc *crtc = NULL; ++ ++ if (con->encoder_id) { ++ enc = drmModeGetEncoder(drmfd, con->encoder_id); ++ if (enc->crtc_id) { ++ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); ++ } ++ } ++ ++ if (!s->conId && crtc) { ++ s->conId = con->connector_id; ++ s->crtcId = crtc->crtc_id; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n", ++ con->connector_id, ++ crtc ? crtc->crtc_id : 0, ++ con->connector_type, ++ crtc ? crtc->width : 0, ++ crtc ? crtc->height : 0, ++ (s->conId == (int)con->connector_id ? ++ " (chosen)" : "")); ++ } ++ ++ if (!s->conId) { ++ av_log(avctx, AV_LOG_ERROR, ++ "No suitable enabled connector found.\n"); ++ return -1;; ++ } ++ } ++ ++ s->crtcIdx = -1; ++ ++ for (i = 0; i < res->count_crtcs; ++i) { ++ if (s->crtcId == res->crtcs[i]) { ++ s->crtcIdx = i; ++ break; ++ } ++ } ++ ++ if (s->crtcIdx == -1) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); ++ goto fail_res; ++ } ++ ++ if (res->count_connectors <= 0) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); ++ goto fail_res; ++ } ++ ++ c = drmModeGetConnector(drmfd, s->conId); ++ if (!c) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); ++ goto fail_res; ++ } ++ ++ if (!c->count_modes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); ++ goto fail_conn; ++ } ++ ++ { ++ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); ++ s->compose.x = crtc->x; ++ s->compose.y = crtc->y; ++ s->compose.width = crtc->width; ++ s->compose.height = crtc->height; ++ drmModeFreeCrtc(crtc); ++ } ++ ++ if (pConId) ++ *pConId = c->connector_id; ++ ret = 0; ++ ++fail_conn: ++ drmModeFreeConnector(c); ++ ++fail_res: ++ drmModeFreeResources(res); ++ ++ return ret; ++} ++ ++// deinit is called if init 
fails so no need to clean up explicity here ++static int drm_vout_init(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ int rv; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->drm_fd = -1; ++ de->con_id = 0; ++ de->setup = (struct drm_setup){0}; ++ de->q_terminate = 0; ++ ++ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) ++ { ++ rv = AVERROR(errno); ++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); ++ rv = AVERROR(EINVAL); ++ goto fail_close; ++ } ++ ++ sem_init(&de->q_sem_in, 0, 0); ++ sem_init(&de->q_sem_out, 0, 0); ++ if (pthread_create(&de->q_thread, NULL, display_thread, s)) { ++ rv = AVERROR(errno); ++ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv)); ++ goto fail_close; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++ ++ return 0; ++ ++fail_close: ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__); ++ ++ return rv; ++} ++ ++static void drm_vout_deinit(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem_in); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem_in); ++ sem_destroy(&de->q_sem_out); ++ ++ for (unsigned int i = 0; i != AUX_SIZE; ++i) ++ da_uninit(de, de->aux + i); ++ ++ av_frame_free(&de->q_next); ++ ++ if (de->drm_fd >= 0) { ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++} ++ ++ ++#define OFFSET(x) offsetof(drm_display_env_t, x) ++static const AVOption options[] = { ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "drm_module", "drm_module name to 
use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++}; ++ ++static const AVClass drm_vout_class = { ++ .class_name = "drm vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_drm_muxer = { ++ .name = "vout_drm", ++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), ++ .priv_data_size = sizeof(drm_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = drm_vout_write_header, ++ .write_packet = drm_vout_write_packet, ++ .write_uncoded_frame = drm_vout_write_frame, ++ .write_trailer = drm_vout_write_trailer, ++ .control_message = drm_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &drm_vout_class, ++ .init = drm_vout_init, ++ .deinit = drm_vout_deinit, ++}; ++ +diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c +new file mode 100644 +index 0000000000..cc6e310551 +--- /dev/null ++++ b/libavdevice/egl_vout.c +@@ -0,0 +1,788 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++// Amongst other issues it doesn't wait for the pic to be displayed before ++// returning the buffer so flikering does occur. ++ ++#include ++#include ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++#define TRACE_ALL 0 ++ ++struct egl_setup { ++ int conId; ++ ++ Display *dpy; ++ EGLDisplay egl_dpy; ++ EGLContext ctx; ++ EGLSurface surf; ++ Window win; ++ ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct egl_aux_s { ++ int fd; ++ GLuint texture; ++ ++} egl_aux_t; ++ ++typedef struct egl_display_env_s { ++ AVClass *class; ++ ++ struct egl_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ int show_all; ++ int window_width, window_height; ++ int window_x, window_y; ++ int fullscreen; ++ ++ egl_aux_t aux[32]; ++ ++ pthread_t q_thread; ++ pthread_mutex_t q_lock; ++ sem_t display_start_sem; ++ sem_t q_sem; ++ int q_terminate; ++ AVFrame *q_this; ++ AVFrame *q_next; ++ ++} egl_display_env_t; ++ ++ ++/** ++ * Remove window border/decorations. 
++ */ ++static void ++no_border(Display *dpy, Window w) ++{ ++ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); ++ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; ++ ++ typedef struct { ++ unsigned long flags; ++ unsigned long functions; ++ unsigned long decorations; ++ long inputMode; ++ unsigned long status; ++ } PropMotifWmHints; ++ ++ PropMotifWmHints motif_hints; ++ Atom prop, proptype; ++ unsigned long flags = 0; ++ ++ /* setup the property */ ++ motif_hints.flags = MWM_HINTS_DECORATIONS; ++ motif_hints.decorations = flags; ++ ++ /* get the atom for the property */ ++ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True); ++ if (!prop) { ++ /* something went wrong! */ ++ return; ++ } ++ ++ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ ++ proptype = prop; ++ ++ XChangeProperty(dpy, w, /* display, window */ ++ prop, proptype, /* property, type */ ++ 32, /* format: 32-bit datums */ ++ PropModeReplace, /* mode */ ++ (unsigned char *)&motif_hints, /* data */ ++ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ ++ ); ++} ++ ++ ++/* ++ * Create an RGB, double-buffered window. ++ * Return the window and context handles. ++ */ ++static int ++make_window(struct AVFormatContext *const s, ++ egl_display_env_t *const de, ++ Display *dpy, EGLDisplay egl_dpy, const char *name, ++ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) ++{ ++ int scrnum = DefaultScreen(dpy); ++ XSetWindowAttributes attr; ++ unsigned long mask; ++ Window root = RootWindow(dpy, scrnum); ++ Window win; ++ EGLContext ctx; ++ const int fullscreen = de->fullscreen; ++ EGLConfig config; ++ int x = de->window_x; ++ int y = de->window_y; ++ int width = de->window_width ? de->window_width : 1280; ++ int height = de->window_height ? 
de->window_height : 720; ++ ++ ++ if (fullscreen) { ++ int scrnum = DefaultScreen(dpy); ++ ++ x = 0; y = 0; ++ width = DisplayWidth(dpy, scrnum); ++ height = DisplayHeight(dpy, scrnum); ++ } ++ ++ { ++ EGLint num_configs; ++ static const EGLint attribs[] = { ++ EGL_RED_SIZE, 1, ++ EGL_GREEN_SIZE, 1, ++ EGL_BLUE_SIZE, 1, ++ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, ++ EGL_NONE ++ }; ++ ++ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { ++ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); ++ return -1; ++ } ++ } ++ ++ { ++ EGLint vid; ++ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); ++ return -1; ++ } ++ ++ { ++ XVisualInfo visTemplate = { ++ .visualid = vid, ++ }; ++ int num_visuals; ++ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, ++ &visTemplate, &num_visuals); ++ ++ /* window attributes */ ++ attr.background_pixel = 0; ++ attr.border_pixel = 0; ++ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone); ++ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; ++ /* XXX this is a bad way to get a borderless window! 
*/ ++ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; ++ ++ win = XCreateWindow(dpy, root, x, y, width, height, ++ 0, visinfo->depth, InputOutput, ++ visinfo->visual, mask, &attr); ++ XFree(visinfo); ++ } ++ } ++ ++ if (fullscreen) ++ no_border(dpy, win); ++ ++ /* set hints and properties */ ++ { ++ XSizeHints sizehints; ++ sizehints.x = x; ++ sizehints.y = y; ++ sizehints.width = width; ++ sizehints.height = height; ++ sizehints.flags = USSize | USPosition; ++ XSetNormalHints(dpy, win, &sizehints); ++ XSetStandardProperties(dpy, win, name, name, ++ None, (char **)NULL, 0, &sizehints); ++ } ++ ++ eglBindAPI(EGL_OPENGL_ES_API); ++ ++ { ++ static const EGLint ctx_attribs[] = { ++ EGL_CONTEXT_CLIENT_VERSION, 2, ++ EGL_NONE ++ }; ++ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs); ++ if (!ctx) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ } ++ ++ ++ XMapWindow(dpy, win); ++ ++ { ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); ++ if (!surf) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); ++ return -1; ++ } ++ ++ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ ++ *winRet = win; ++ *ctxRet = ctx; ++ *surfRet = surf; ++ } ++ ++ return 0; ++} ++ ++static GLint ++compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source) ++{ ++ GLuint s = glCreateShader(target); ++ ++ if (s == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); ++ return 0; ++ } ++ ++ glShaderSource(s, 1, (const GLchar **)&source, NULL); ++ glCompileShader(s); ++ ++ { ++ GLint ok; ++ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); ++ ++ if (!ok) { ++ GLchar *info; ++ GLint size; ++ ++ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); ++ info = malloc(size); ++ ++ glGetShaderInfoLog(s, size, NULL, info); ++ av_log(avctx, AV_LOG_ERROR, "Failed to 
compile shader: %ssource:\n%s\n", info, source); ++ ++ return 0; ++ } ++ } ++ ++ return s; ++} ++ ++static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs) ++{ ++ GLuint prog = glCreateProgram(); ++ ++ if (prog == 0) { ++ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); ++ return 0; ++ } ++ ++ glAttachShader(prog, vs); ++ glAttachShader(prog, fs); ++ glLinkProgram(prog); ++ ++ { ++ GLint ok; ++ glGetProgramiv(prog, GL_LINK_STATUS, &ok); ++ if (!ok) { ++ /* Some drivers return a size of 1 for an empty log. This is the size ++ * of a log that contains only a terminating NUL character. ++ */ ++ GLint size; ++ GLchar *info = NULL; ++ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); ++ if (size > 1) { ++ info = malloc(size); ++ glGetProgramInfoLog(prog, size, NULL, info); ++ } ++ ++ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", ++ (info != NULL) ? info : ""); ++ return 0; ++ } ++ } ++ ++ return prog; ++} ++ ++static int ++gl_setup(struct AVFormatContext *const s) ++{ ++ const char *vs = ++ "attribute vec4 pos;\n" ++ "varying vec2 texcoord;\n" ++ "\n" ++ "void main() {\n" ++ " gl_Position = pos;\n" ++ " texcoord.x = (pos.x + 1.0) / 2.0;\n" ++ " texcoord.y = (-pos.y + 1.0) / 2.0;\n" ++ "}\n"; ++ const char *fs = ++ "#extension GL_OES_EGL_image_external : enable\n" ++ "precision mediump float;\n" ++ "uniform samplerExternalOES s;\n" ++ "varying vec2 texcoord;\n" ++ "void main() {\n" ++ " gl_FragColor = texture2D(s, texcoord);\n" ++ "}\n"; ++ ++ GLuint vs_s; ++ GLuint fs_s; ++ GLuint prog; ++ ++ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || ++ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || ++ !(prog = link_program(s, vs_s, fs_s))) ++ return -1; ++ ++ glUseProgram(prog); ++ ++ { ++ static const float verts[] = { ++ -1, -1, ++ 1, -1, ++ 1, 1, ++ -1, 1, ++ }; ++ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); ++ } ++ ++ glEnableVertexAttribArray(0); ++ return 0; ++} ++ ++static int 
egl_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int egl_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters *const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if (s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++ ++static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame) ++{ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ egl_aux_t *da = NULL; ++ unsigned int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (i = 0; i != 32; ++i) { ++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { ++ da = de->aux + i; ++ break; ++ } ++ } ++ ++ if (da == NULL) { ++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (da->texture == 0) { ++ EGLint attribs[50]; ++ EGLint *a = attribs; ++ int i, j; ++ static const EGLint anames[] = { ++ EGL_DMA_BUF_PLANE0_FD_EXT, ++ EGL_DMA_BUF_PLANE0_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE0_PITCH_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE1_FD_EXT, ++ EGL_DMA_BUF_PLANE1_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE1_PITCH_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE2_FD_EXT, ++ EGL_DMA_BUF_PLANE2_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE2_PITCH_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, ++ }; ++ const EGLint *b = anames; ++ ++ *a++ = EGL_WIDTH; ++ *a++ = av_frame_cropped_width(frame); ++ *a++ = EGL_HEIGHT; ++ *a++ = av_frame_cropped_height(frame); ++ *a++ = 
EGL_LINUX_DRM_FOURCC_EXT; ++ *a++ = desc->layers[0].format; ++ ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index; ++ *a++ = *b++; ++ *a++ = obj->fd; ++ *a++ = *b++; ++ *a++ = p->offset; ++ *a++ = *b++; ++ *a++ = p->pitch; ++ if (obj->format_modifier == 0) { ++ b += 2; ++ } ++ else { ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier >> 32); ++ } ++ } ++ } ++ ++ *a = EGL_NONE; ++ ++#if TRACE_ALL ++ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { ++ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); ++ } ++#endif ++ { ++ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, ++ EGL_NO_CONTEXT, ++ EGL_LINUX_DMA_BUF_EXT, ++ NULL, attribs); ++ if (!image) { ++ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); ++ return -1; ++ } ++ ++ glGenTextures(1, &da->texture); ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); ++ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); ++ ++ eglDestroyImageKHR(de->setup.egl_dpy, image); ++ } ++ ++ da->fd = desc->objects[0].fd; ++ } ++ ++ glClearColor(0.5, 0.5, 0.5, 0.5); ++ glClear(GL_COLOR_BUFFER_BIT); ++ ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glDrawArrays(GL_TRIANGLE_FAN, 0, 4); ++ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); ++ ++ glDeleteTextures(1, &da->texture); ++ da->texture = 0; ++ da->fd = -1; ++ ++ return 0; ++} ++ ++static void* display_thread(void *v) ++{ ++ AVFormatContext *const s = v; ++ egl_display_env_t *const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ { ++ EGLint 
egl_major, egl_minor; ++ ++ de->setup.dpy = XOpenDisplay(NULL); ++ if (!de->setup.dpy) { ++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); ++ goto fail; ++ } ++ ++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); ++ if (!de->setup.egl_dpy) { ++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); ++ goto fail; ++ } ++ ++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); ++ goto fail; ++ } ++ ++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); ++ ++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { ++ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); ++ goto fail; ++ } ++ } ++ ++ if (!de->window_width || !de->window_height) { ++ de->window_width = 1280; ++ de->window_height = 720; ++ } ++ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", ++ &de->setup.win, &de->setup.ctx, &de->setup.surf)) { ++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); ++ goto fail; ++ } ++ ++ if (gl_setup(s)) { ++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); ++ goto fail; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); ++#endif ++ sem_post(&de->display_start_sem); ++ ++ for (;;) { ++ AVFrame *frame; ++ ++ while (sem_wait(&de->q_sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++ ++ if (de->q_terminate) ++ break; ++ ++ pthread_mutex_lock(&de->q_lock); ++ frame = de->q_next; ++ de->q_next = NULL; ++ pthread_mutex_unlock(&de->q_lock); ++ ++ do_display(s, de, frame); ++ ++ av_frame_free(&de->q_this); ++ de->q_this = frame; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++#endif ++ ++ return NULL; ++ ++fail: ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); ++#endif ++ de->q_terminate = 1; ++ sem_post(&de->display_start_sem); ++ ++ return NULL; ++} ++ ++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ 
++ const AVFrame *const src_frame = (AVFrame *)pkt->data; ++ AVFrame *frame; ++ egl_display_env_t *const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ // Really hacky sync ++ while (de->show_all && de->q_next) { ++ usleep(3000); ++ } ++ ++ pthread_mutex_lock(&de->q_lock); ++ { ++ AVFrame *const t = de->q_next; ++ de->q_next = frame; ++ frame = t; ++ } ++ pthread_mutex_unlock(&de->q_lock); ++ ++ if (frame == NULL) ++ sem_post(&de->q_sem); ++ else ++ av_frame_free(&frame); ++ ++ return 0; ++} ++ ++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* egl_vout_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++ ++ return 0; ++} ++ ++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch (type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int egl_vout_init(struct AVFormatContext *s) ++{ ++ egl_display_env_t *const 
de = s->priv_data; ++ unsigned int i; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->setup = (struct egl_setup) { 0 }; ++ ++ for (i = 0; i != 32; ++i) { ++ de->aux[i].fd = -1; ++ } ++ ++ de->q_terminate = 0; ++ pthread_mutex_init(&de->q_lock, NULL); ++ sem_init(&de->q_sem, 0, 0); ++ sem_init(&de->display_start_sem, 0, 0); ++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); ++ ++ sem_wait(&de->display_start_sem); ++ if (de->q_terminate) { ++ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); ++ return -1; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++ ++ return 0; ++} ++ ++static void egl_vout_deinit(struct AVFormatContext *s) ++{ ++ egl_display_env_t *const de = s->priv_data; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem); ++ pthread_mutex_destroy(&de->q_lock); ++ ++ av_frame_free(&de->q_next); ++ av_frame_free(&de->q_this); ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++} ++ ++#define OFFSET(x) offsetof(egl_display_env_t, x) ++static const AVOption options[] = { ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++ ++}; ++ ++static const AVClass egl_vout_class = { ++ .class_name = "egl vid outdev", ++ .item_name = av_default_item_name, ++ 
.option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_egl_muxer = { ++ .name = "vout_egl", ++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), ++ .priv_data_size = sizeof(egl_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = egl_vout_write_header, ++ .write_packet = egl_vout_write_packet, ++ .write_uncoded_frame = egl_vout_write_frame, ++ .write_trailer = egl_vout_write_trailer, ++ .control_message = egl_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &egl_vout_class, ++ .init = egl_vout_init, ++ .deinit = egl_vout_deinit, ++}; ++ +diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c +new file mode 100644 +index 0000000000..84723a34ad +--- /dev/null ++++ b/libavdevice/rpi_vout.c +@@ -0,0 +1,534 @@ ++/* ++ * Copyright (c) 2013 Jeff Moguillansky ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * XVideo output device ++ * ++ * TODO: ++ * - add support to more formats ++ */ ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include ++#include ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++#include "libavutil/rpi_sand_fns.h" ++#include "libavcodec/rpi_zc.h" ++ ++#define TRACE_ALL 0 ++ ++#define DISPLAY_PORT_DEPTH 4 ++ ++typedef struct rpi_display_env_s ++{ ++ AVClass *class; ++ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ ++ MMAL_FOURCC_T req_fmt; ++ MMAL_VIDEO_FORMAT_T req_vfmt; ++ ++ AVZcEnvPtr zc; ++ ++ int window_width, window_height; ++ int window_x, window_y; ++ int layer, fullscreen; ++ int show_all; ++} rpi_display_env_t; ++ ++ ++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++ ++static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt) ++{ ++ switch (fmt) { ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ return MMAL_ENCODING_YUVUV128; ++ case AV_PIX_FMT_RPI4_10: ++ return MMAL_ENCODING_YUV10_COL; ++ case 
AV_PIX_FMT_SAND64_10: ++ return MMAL_ENCODING_YUVUV64_10; ++ case AV_PIX_FMT_SAND64_16: ++ return MMAL_ENCODING_YUVUV64_16; ++ case AV_PIX_FMT_YUV420P: ++ return MMAL_ENCODING_I420; ++ ++ default: ++ break; ++ } ++ return 0; ++} ++ ++ ++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt, ++ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref) ++{ ++ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; ++ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref); ++ if (av_rpi_is_sand_format(geo->format)) { ++ // Sand formats are a bit "special" ++ // stride1 implicit in format ++ // width = stride2 ++ vfmt->width = geo->stripe_is_yc ? ++ geo->height_y + geo->height_c : geo->height_y; ++// es->height = geo->video_height; //*** When we get the FLAG this will change ++ vfmt->height = geo->height_y; ++ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; ++ } ++ else { ++ vfmt->width = geo->stride_y / geo->bytes_per_pel; ++ vfmt->height = geo->height_y; ++ es_fmt->flags = 0; ++ } ++ ++ es_fmt->type = MMAL_ES_TYPE_VIDEO; ++ es_fmt->encoding = mmfmt_from_avfmt(geo->format); ++ es_fmt->encoding_variant = 0; ++ es_fmt->bitrate = 0; ++ ++ vfmt->crop.x = frame->crop_left; ++ vfmt->crop.y = frame->crop_top; ++ vfmt->crop.width = av_frame_cropped_width(frame); ++ vfmt->crop.height = av_frame_cropped_height(frame); ++ ++ vfmt->frame_rate.den = 0; // Don't think I know it here ++ vfmt->frame_rate.num = 0; ++ ++ vfmt->par.den = frame->sample_aspect_ratio.den; ++ vfmt->par.num = frame->sample_aspect_ratio.num; ++ ++ vfmt->color_space = 0; // Unknown currently ++} ++ ++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) ++{ ++ rpi_display_env_t * const de = userdata; ++ if (buf->user_data != NULL) { ++ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); ++ buf->user_data = NULL; ++ } ++ atomic_fetch_add(&de->rpi_display_count, -1); ++ return MMAL_FALSE; ++} ++ ++static inline int avfmt_needs_isp(const enum AVPixelFormat 
avfmt) ++{ ++ return avfmt == AV_PIX_FMT_SAND64_10; ++} ++ ++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) ++{ ++ if (de->isp != NULL) ++ { ++ if (de->isp->input[0]->is_enabled) ++ mmal_port_disable(de->isp->input[0]); ++ if (de->isp->control->is_enabled) ++ mmal_port_disable(de->isp->control); ++ } ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ de->conn = NULL; ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ de->isp = NULL; ++ } ++} ++ ++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) ++{ ++ MMAL_BUFFER_HEADER_T* buf = NULL; ++ AVRpiZcRefPtr fr_buf = NULL; ++ ++ if (de == NULL) ++ return; ++ ++ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); ++ return; ++ } ++ ++ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { ++ return; ++ } ++ ++ buf = mmal_queue_get(de->rpi_pool->queue); ++ if (!buf) { ++ // Running too fast so drop the frame (unexpected) ++ goto fail; ++ } ++ ++ buf->cmd = 0; ++ buf->offset = 0; ++ buf->flags = 0; ++ mmal_buffer_header_reset(buf); ++ ++ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release ++ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ ++ while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ usleep(5000); ++ } ++ ++ { ++ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; ++ MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; ++ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; ++ ++ video_format_from_zc_frame(&new_es, fr, fr_buf); ++ if (de->req_fmt != new_es.encoding || ++ de->req_vfmt.width 
!= new_vfmt->width || ++ de->req_vfmt.height != new_vfmt->height || ++ de->req_vfmt.crop.x != new_vfmt->crop.x || ++ de->req_vfmt.crop.y != new_vfmt->crop.y || ++ de->req_vfmt.crop.width != new_vfmt->crop.width || ++ de->req_vfmt.crop.height != new_vfmt->crop.height) { ++ // Something has changed ++ ++ // If we have an ISP tear it down ++ isp_remove(s, de); ++ de->port_in = de->display->input[0]; ++ ++ // If we still need an ISP create it now ++ if (avfmt_needs_isp(fr->format)) ++ { ++ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); ++ goto fail; ++ } ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_format_copy(de->port_in->format, &new_es); ++ ++ if (mmal_port_format_commit(de->port_in)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); ++ goto fail; ++ } ++ ++ // If we have an ISP then we must want to use it ++ if (de->isp != NULL) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; ++ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; ++ ++ port_out->format->type = MMAL_ES_TYPE_VIDEO; ++ port_out->format->encoding = MMAL_ENCODING_YUVUV128; ++ port_out->format->encoding_variant = 0; ++ port_out->format->bitrate = 0; ++ port_out->format->flags = 0; ++ port_out->format->extradata = NULL; ++ port_out->format->extradata_size = 0; ++ ++ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; ++ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; ++ vfmt_out->crop.x = 0; ++ vfmt_out->crop.y = 0; ++ vfmt_out->crop.width = vfmt_in->crop.width; ++ vfmt_out->crop.height = vfmt_in->crop.height; ++ vfmt_out->frame_rate = vfmt_in->frame_rate; ++ vfmt_out->par = vfmt_in->par; ++ vfmt_out->color_space = vfmt_in->color_space; ++ ++ if (mmal_port_format_commit(port_out)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); ++ goto fail; ++ } ++ ++ if (mmal_connection_create(&de->conn, port_out, 
de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ // Number of slots in my port Q ++ de->port_in->buffer_num = DISPLAY_PORT_DEPTH; ++ // Size to keep it happy - isn't used for anything other than error checking ++ de->port_in->buffer_size = buf->alloc_size; ++ if (!de->port_in->is_enabled) ++ { ++ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? ++ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); ++ goto fail; ++ } ++ } ++ ++ de->req_fmt = new_es.encoding; ++ de->req_vfmt = *new_vfmt; ++ } ++ } ++ ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ goto fail; ++ } ++ return; ++ ++fail: ++ // If we have a buf then fr_buf is held by that ++ if (buf != NULL) ++ mmal_buffer_header_release(buf); ++ else if (fr_buf != NULL) ++ av_rpi_zc_unref(fr_buf); ++} ++ ++ ++static int xv_write_trailer(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if (de->port_in != NULL && de->port_in->is_enabled) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (atomic_load(&de->rpi_display_count) != 0) { ++ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); ++ } ++ ++ isp_remove(s, de); ++ if (de->rpi_pool != NULL) { ++ 
mmal_pool_destroy(de->rpi_pool); ++ de->rpi_pool = NULL; ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ de->display = NULL; ++ } ++ ++ return 0; ++} ++ ++static int xv_write_header(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ const unsigned int w = de->window_width ? de->window_width : par->width; ++ const unsigned int h = de->window_height ? de->window_height : par->height; ++ const unsigned int x = de->window_x; ++ const unsigned int y = de->window_y; ++ const int layer = de->layer ? de->layer : 2; ++ const MMAL_BOOL_T fullscreen = de->fullscreen; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ { ++ MMAL_DISPLAYREGION_T region = ++ { ++ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, ++ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | ++ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, ++ .layer = layer, ++ .fullscreen = fullscreen, ++ .dest_rect = {x, y, w, h}, ++ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, ++ }; ++ ++ bcm_host_init(); // Needs to be done by someone... 
++ ++ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); ++ goto fail; ++ } ++ de->port_in = de->display->input[0]; ++ ++ mmal_port_parameter_set(de->display->input[0], &region.hdr); ++ ++ if (mmal_component_enable(de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); ++ goto fail; ++ } ++ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); ++ goto fail; ++ } ++ ++ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ xv_write_trailer(s); ++ return AVERROR_UNKNOWN; ++} ++ ++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ AVFrame * const frame = (AVFrame *)pkt->data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ display_frame(s, s->priv_data, frame); ++ return 0; ++} ++ ++static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* xv_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++// return write_picture(s, (*frame)->data, (*frame)->linesize); ++ ++ display_frame(s, s->priv_data, *ppframe); ++ return 0; ++} ++ ++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// deinit is called if init fails so no need to clean up explicitly here ++static 
int rpi_vout_init(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ // Get a ZC context in case we need one - has little overhead if unused ++ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) ++ return AVERROR(ENOMEM); /* failure must be a negative AVERROR code, not a positive value */ ++ ++ return 0; ++} ++ ++static void rpi_vout_deinit(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ av_rpi_zc_int_env_freep(&de->zc); ++} ++ ++ ++#define OFFSET(x) offsetof(rpi_display_env_t, x) ++static const AVOption options[] = { ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++ ++}; ++ ++static const AVClass xv_class = { ++ .class_name = "rpi vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_rpi_muxer = { ++ .name = "vout_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"), ++ .priv_data_size = sizeof(rpi_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = xv_write_header, ++ .write_packet = xv_write_packet, ++ .write_uncoded_frame = xv_write_frame, ++ .write_trailer = xv_write_trailer, ++ 
.control_message = xv_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &xv_class, ++ .init = rpi_vout_init, ++ .deinit = rpi_vout_deinit, ++}; +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index b2c254ea67..144fbda652 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -233,6 +233,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o + OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o + OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o + OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o ++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o + OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o + OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o + OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +@@ -459,6 +460,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o + OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o + OBJS-$(CONFIG_TRIM_FILTER) += trim.o + OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o ++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o + OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o + OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ + opencl/unsharp.o +diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile +index b58daa3a3f..b68209bc94 100644 +--- a/libavfilter/aarch64/Makefile ++++ b/libavfilter/aarch64/Makefile +@@ -1,3 +1,5 @@ ++OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o + ++NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +new file mode 100644 +index 0000000000..f52bc4b9b4 +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ 
-0,0 +1,125 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/common.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/aarch64/cpu.h" ++ ++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); ++ ++ ++static void filter_line3_helper(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ // Asm works on 16 byte chunks ++ // If w is a multiple of 16 then all is good - if not 
then if width rounded ++ // up to nearest 16 will fit in both src & dst strides then allow the asm ++ // to write over the padding bytes as that is almost certainly faster than ++ // having to invoke the C version to clean up the tail. ++ const int w1 = FFALIGN(w, 16); ++ const int w0 = clip_max != 255 ? 0 : ++ d_stride <= w1 && s_stride <= w1 ? w : w & ~15; ++ ++ ff_bwdif_filter_line3_neon(dst1, d_stride, ++ prev1, cur1, next1, s_stride, ++ w0, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride, ++ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride, ++ w - w0, parity, clip_max); ++} ++ ++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, ++ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++} ++ ++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++ ++ if (w0 < w) ++ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++} ++ ++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0, ++ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++} ++ ++void ++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) ++{ ++ const int cpu_flags = av_get_cpu_flags(); ++ ++ if (bit_depth != 8) ++ return; ++ ++ if (!have_neon(cpu_flags)) ++ return; ++ ++ s->filter_intra = filter_intra_helper; ++ s->filter_line = filter_line_helper; ++ s->filter_edge = filter_edge_helper; ++ s->filter_line3 = filter_line3_helper; ++} ++ +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +new file mode 100644 +index 0000000000..ae9aab20cd +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -0,0 +1,788 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Space taken on the stack by an int (32-bit) ++#ifdef __APPLE__ ++.set SP_INT, 4 ++#else ++.set SP_INT, 8 ++#endif ++ ++.macro SQSHRUNN b, s0, s1, s2, s3, n ++ sqshrun \s0\().4h, \s0\().4s, #\n - 8 ++ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8 ++ sqshrun \s1\().4h, \s2\().4s, #\n - 8 ++ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8 ++ uzp2 \b\().16b, \s0\().16b, \s1\().16b ++.endm ++ ++.macro SMULL4K a0, a1, a2, a3, s0, s1, k ++ smull \a0\().4s, \s0\().4h, \k ++ smull2 \a1\().4s, \s0\().8h, \k ++ smull \a2\().4s, \s1\().4h, \k ++ smull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMULL4K a0, a1, a2, a3, s0, s1, k ++ umull \a0\().4s, \s0\().4h, \k ++ umull2 \a1\().4s, \s0\().8h, \k ++ umull \a2\().4s, \s1\().4h, \k ++ umull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k ++ umlal \a0\().4s, \s0\().4h, \k ++ umlal2 \a1\().4s, \s0\().8h, \k ++ umlal \a2\().4s, \s1\().4h, \k ++ umlal2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k ++ umlsl \a0\().4s, \s0\().4h, \k ++ umlsl2 \a1\().4s, \s0\().8h, \k ++ umlsl \a2\().4s, \s1\().4h, \k ++ umlsl2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++// int b = m2s1 - m1; ++// int f = p2s1 - p1; ++// int dc = c0s1 - m1; ++// int de = c0s1 - p1; ++// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1); ++// sp_max = FFMIN(sp_max, FFMAX(-b,-f)); ++// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1); ++// sp_min = FFMIN(sp_min, FFMAX(b,f)); ++// diff = diff == 0 ? 
0 : FFMAX3(diff, sp_min, sp_max); ++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3 ++ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b ++ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b ++ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b ++ umax \t3\().16b, \t3\().16b, \t1\().16b ++ umin \t3\().16b, \t3\().16b, \t2\().16b ++ ++ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b ++ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b ++ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b ++ umax \t0\().16b, \t0\().16b, \t1\().16b ++ umin \t2\().16b, \t2\().16b, \t0\().16b ++ ++ cmeq \t1\().16b, \diff\().16b, #0 ++ umax \diff\().16b, \diff\().16b, \t3\().16b ++ umax \diff\().16b, \diff\().16b, \t2\().16b ++ bic \diff\().16b, \diff\().16b, \t1\().16b ++.endm ++ ++// i0 = s0; ++// if (i0 > d0 + diff0) ++// i0 = d0 + diff0; ++// else if (i0 < d0 - diff0) ++// i0 = d0 - diff0; ++// ++// i0 = s0 is safe ++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1 ++ uqadd \t0\().16b, \d0\().16b, \diff\().16b ++ uqsub \t1\().16b, \d0\().16b, \diff\().16b ++ umin \i0\().16b, \s0\().16b, \t0\().16b ++ umax \i0\().16b, \i0\().16b, \t1\().16b ++.endm ++ ++// i0 = FFABS(m1 - p1) > td0 ? i1 : i2; ++// DIFF_CLIP ++// ++// i0 = i1 is safe ++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2 ++ uabd \t0\().16b, \m1\().16b, \p1\().16b ++ cmhi \t0\().16b, \t0\().16b, \td0\().16b ++ bsl \t0\().16b, \i1\().16b, \i2\().16b ++ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 ++.endm ++ ++.macro PUSH_VREGS ++ stp d8, d9, [sp, #-64]! 
++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++.endm ++ ++.macro POP_VREGS ++ ldp d14, d15, [sp, #48] ++ ldp d12, d13, [sp, #32] ++ ldp d10, d11, [sp, #16] ++ ldp d8, d9, [sp], #64 ++.endm ++ ++.macro LDR_COEFFS d, t0 ++ movrel \t0, coeffs, 0 ++ ld1 {\d\().8h}, [\t0] ++.endm ++ ++// static const uint16_t coef_lf[2] = { 4309, 213 }; ++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; ++// static const uint16_t coef_sp[2] = { 5077, 981 }; ++ ++const coeffs, align=4 // align 4 means align on 2^4 boundary ++ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0] ++ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] ++ .hword 5077, 981 // sp[0] = v0.h[6] ++endconst ++ ++// =========================================================================== ++// ++// void ff_bwdif_filter_line3_neon( ++// void * dst1, // x0 ++// int d_stride, // w1 ++// const void * prev1, // x2 ++// const void * cur1, // x3 ++// const void * next1, // x4 ++// int s_stride, // w5 ++// int w, // w6 ++// int parity, // w7 ++// int clip_max); // [sp, #0] (Ignored) ++ ++function ff_bwdif_filter_line3_neon, export=1 ++ // Sanity check w ++ cmp w6, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ cmp w7, #0 ++ csel x17, x2, x4, ne ++ ++ // We want all the V registers - save all the ones we must ++ PUSH_VREGS ++ ++ // Some rearrangement of initial values for nice layout of refs in regs ++ mov w10, w6 // w10 = loop count ++ neg w9, w5 // w9 = mref ++ lsl w8, w9, #1 // w8 = mref2 ++ add w7, w9, w9, LSL #1 // w7 = mref3 ++ lsl w6, w9, #2 // w6 = mref4 ++ mov w11, w5 // w11 = pref ++ lsl w12, w5, #1 // w12 = pref2 ++ add w13, w5, w5, LSL #1 // w13 = pref3 ++ lsl w14, w5, #2 // w14 = pref4 ++ add w15, w5, w5, LSL #2 // w15 = pref5 ++ add w16, w14, w12 // w16 = pref6 ++ ++ lsl w5, w1, #1 // w5 = d_stride * 2 ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21) ++ add v20.8h, v20.8h, v24.8h ++ add v21.8h, v21.8h, v25.8h ++ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5] ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += 
coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ ldr q28, [x3, w16, sxtw] ++ ldr q25, [x17, w16, sxtw] ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25 ++ uaddl v24.8h, v25.8b, v28.8b ++ uaddl2 v25.8h, v25.16b, v28.16b ++ ++// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) ++ add v24.8h, v24.8h, v22.8h ++ add v25.8h, v25.8h, v23.8h ++ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p5 = cur[prefs5]; // p5 = v25 ++ ldr q25, [x3, w15, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// j1 += coef_hf[0] * p2; // - ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2] ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// j1 -= coef_lf[1] * 4 * (m1 + p5); // - ++ uaddl v26.8h, v24.8b, v25.8b ++ uaddl2 v27.8h, v24.16b, v25.16b ++ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1] ++ ++// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16 ++ uaddl v18.8h, v22.8b, 
v21.8b ++ uaddl2 v19.8h, v22.16b, v21.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v24.8b, v25.8b ++ uaddl2 v19.8h, v24.16b, v25.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v16, v28, v29, v30, v31, 13 ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21 ++ uaddl v26.8h, v21.8b, v22.8b ++ uaddl2 v27.8h, v21.16b, v22.16b ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9* ++ SQSHRUNN v3, v6, v7, v8, v9, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++ ldr q27, [x2, w13, sxtw] ++ ldr q26, [x4, w13, sxtw] ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++// } // v28, v30 preserved for next block ++// { // tdiff2 = v14 ++// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1; ++// int t2 
=(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1; ++ uabd v31.16b, v21.16b, v27.16b ++ uabd v29.16b, v21.16b, v26.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19 ++ ushr v19.16b, v14.16b, #1 ++ umax v19.16b, v19.16b, v31.16b ++ umax v19.16b, v19.16b, v29.16b ++// } ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12 ++ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28 ++ ++ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19 ++ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29 ++ ++// dst[d_stride * 2] = av_clip_uint8(interpol); ++ str q3, [x0, w5, sxtw] ++ ++// dst[d_stride] = p1; ++ str q22, [x0, w1, sxtw] ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// =========================================================================== ++// ++// void filter_line( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int prefs3, // [sp, #SP_INT] ++// int mrefs3, // [sp, #SP_INT*2] ++// int prefs4, // [sp, #SP_INT*3] ++// int mrefs4, // [sp, #SP_INT*4] ++// int parity, // [sp, #SP_INT*5] ++// int clip_max) // [sp, #SP_INT*6] ++ ++function ff_bwdif_filter_line_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 
99f ++ ++ // Rearrange regs to be the same as line3 for ease of debug! ++ mov w10, w4 // w10 = loop count ++ mov w9, w6 // w9 = mref ++ mov w12, w7 // w12 = pref2 ++ mov w11, w5 // w11 = pref ++ ldr w8, [sp, #0] // w8 = mref2 ++ ldr w7, [sp, #SP_INT*2] // w7 = mref3 ++ ldr w6, [sp, #SP_INT*4] // w6 = mref4 ++ ldr w13, [sp, #SP_INT] // w13 = pref3 ++ ldr w14, [sp, #SP_INT*3] // w14 = pref4 ++ ++ mov x4, x3 ++ mov x3, x2 ++ mov x2, x1 ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? prev : next; ++ ldr w17, [sp, #SP_INT*5] // parity ++ cmp w17, #0 ++ csel x17, x2, x4, ne ++ ++ PUSH_VREGS ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr 
q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd 
v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_edge_neon( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int parity, // [sp, #SP_INT] ++// int clip_max, // [sp, #SP_INT*2] unused ++// int spat); // [sp, #SP_INT*3] ++ ++function ff_bwdif_filter_edge_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ ++ ldr w8, [sp, #0] // mrefs2 ++ ++ ldr w17, [sp, #SP_INT] // parity ++ ldr w16, [sp, #SP_INT*3] // spat ++ cmp w17, #0 ++ csel x17, x1, x3, ne ++ ++// for (x = 0; x < w; x++) { ++ ++10: ++// int m1 = cur[mrefs]; ++// int d = (prev2[0] + next2[0]) >> 1; ++// int p1 = cur[prefs]; ++// int temporal_diff0 = FFABS(prev2[0] - next2[0]); ++// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); ++ ldr q31, [x2] ++ ldr q21, [x17] ++ uhadd v16.16b, v31.16b, v21.16b // d0 = v16 ++ uabd v17.16b, v31.16b, v21.16b // td0 = v17 ++ ldr q24, [x2, w6, sxtw] // m1 = v24 ++ ldr q22, [x2, w5, sxtw] // p1 = v22 ++ ++ ldr q0, [x1, w6, sxtw] // prev[mrefs] ++ ldr q2, [x1, w5, sxtw] // prev[prefs] ++ ldr q1, [x3, w6, sxtw] // next[mrefs] ++ ldr q3, [x3, w5, sxtw] // next[prefs] ++ ++ ushr v29.16b, v17.16b, #1 ++ ++ uabd v31.16b, v0.16b, v24.16b ++ uabd v30.16b, v2.16b, v22.16b ++ uhadd v0.16b, v31.16b, v30.16b // td1 = q0 ++ ++ uabd v31.16b, v1.16b, v24.16b ++ uabd v30.16b, v3.16b, v22.16b ++ uhadd v1.16b, v31.16b, v30.16b // td2 = q1 ++ ++ umax v0.16b, v0.16b, v29.16b ++ umax v0.16b, v0.16b, v1.16b // diff = v0 ++ ++// if (spat) { ++// SPAT_CHECK() ++// } ++// i0 = (m1 + p1) >> 1; ++ cbz w16, 1f ++ ++ ldr q31, [x2, w8, sxtw] ++ ldr q18, [x17, w8, sxtw] ++ ldr q30, [x2, w7, sxtw] ++ ldr q19, [x17, w7, sxtw] ++ uhadd v18.16b, v18.16b, v31.16b ++ uhadd v19.16b, v19.16b, v30.16b ++ ++ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 ++ ++1: ++ uhadd v2.16b, v22.16b, v24.16b ++ ++ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 ++ DIFF_CLIP v2, v2, v16, v0, v31, v30 ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ subs w4, w4, #16 ++ add x1, x1, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x17, x17, #16 ++ bgt 
10b ++ ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_intra_neon( ++// void *dst1, // x0 ++// void *cur1, // x1 ++// int w, // w2 ++// int prefs, // w3 ++// int mrefs, // w4 ++// int prefs3, // w5 ++// int mrefs3, // w6 ++// int parity, // w7 unused ++// int clip_max) // [sp, #0] unused ++ ++function ff_bwdif_filter_intra_neon, export=1 ++ cmp w2, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// for (x = 0; x < w; x++) { ++10: ++ ++// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; ++ ldr q31, [x1, w4, sxtw] ++ ldr q30, [x1, w3, sxtw] ++ ldr q29, [x1, w6, sxtw] ++ ldr q28, [x1, w5, sxtw] ++ ++ uaddl v20.8h, v31.8b, v30.8b ++ uaddl2 v21.8h, v31.16b, v30.16b ++ ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6] ++ ++ uaddl v20.8h, v29.8b, v28.8b ++ uaddl2 v21.8h, v29.16b, v28.16b ++ ++ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7] ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ SQSHRUNN v2, v2, v3, v4, v5, 13 ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ ++ subs w2, w2, #16 ++ add x1, x1, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 0872c6e0f2..1dd05e4d75 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -218,6 +218,7 @@ extern AVFilter ff_vf_dedot; + extern AVFilter ff_vf_deflate; + extern AVFilter ff_vf_deflicker; + extern AVFilter ff_vf_deinterlace_qsv; ++extern AVFilter ff_vf_deinterlace_v4l2m2m; + extern AVFilter ff_vf_deinterlace_vaapi; + extern AVFilter ff_vf_dejudder; + extern AVFilter ff_vf_delogo; +@@ -377,6 +378,7 @@ extern AVFilter ff_vf_scale; + extern AVFilter ff_vf_scale_cuda; + extern AVFilter ff_vf_scale_npp; + extern AVFilter ff_vf_scale_qsv; ++extern AVFilter ff_vf_scale_v4l2m2m; + extern AVFilter ff_vf_scale_vaapi; + extern AVFilter ff_vf_scale_vulkan; + extern AVFilter ff_vf_scale2ref; +@@ -438,6 +440,7 @@ 
extern AVFilter ff_vf_transpose_opencl; + extern AVFilter ff_vf_transpose_vaapi; + extern AVFilter ff_vf_trim; + extern AVFilter ff_vf_unpremultiply; ++extern AVFilter ff_vf_unsand; + extern AVFilter ff_vf_unsharp; + extern AVFilter ff_vf_unsharp_opencl; + extern AVFilter ff_vf_untile; +diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c +index f6b572b3de..44fe8b679c 100644 +--- a/libavfilter/avfiltergraph.c ++++ b/libavfilter/avfiltergraph.c +@@ -32,6 +32,9 @@ + #include "libavutil/internal.h" + #include "libavutil/opt.h" + #include "libavutil/pixdesc.h" ++#if CONFIG_UNSAND_FILTER ++#include "libavutil/rpi_sand_fns.h" ++#endif + + #define FF_INTERNAL_FIELDS 1 + #include "framequeue.h" +@@ -422,6 +425,19 @@ static int formats_declared(AVFilterContext *f) + return 1; + } + ++#if CONFIG_UNSAND_FILTER ++static int has_sand_format(const AVFilterFormats * const ff) ++{ ++ int i; ++ for (i = 0; i != ff->nb_formats; ++i) { ++ if (av_rpi_is_sand_format(ff->formats[i])) { ++ return 1; ++ } ++ } ++ return 0; ++} ++#endif ++ + /** + * Perform one round of query_formats() and merging formats lists on the + * filter graph. +@@ -462,6 +478,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + for (j = 0; j < filter->nb_inputs; j++) { + AVFilterLink *link = filter->inputs[j]; + int convert_needed = 0; ++ unsigned int extra_convert_tried = 0; + + if (!link) + continue; +@@ -504,11 +521,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + link->outcfg.formats, link->type) + #undef MERGE_DISPATCH + +- if (convert_needed) { ++ while (convert_needed) { + AVFilterContext *convert; + const AVFilter *filter; + AVFilterLink *inlink, *outlink; + char inst_name[30]; ++ int can_retry = 0; ++ ++ convert_needed = 0; + + if (graph->disable_auto_convert) { + av_log(log_ctx, AV_LOG_ERROR, +@@ -521,19 +541,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + /* couldn't merge format lists. 
auto-insert conversion filter */ + switch (link->type) { + case AVMEDIA_TYPE_VIDEO: +- if (!(filter = avfilter_get_by_name("scale"))) { +- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " +- "not present, cannot convert pixel formats.\n"); +- return AVERROR(EINVAL); +- } +- +- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", +- scaler_count++); ++#if CONFIG_UNSAND_FILTER ++ // Only try each extra conversion once ++ // The unsand output pad should never trigger has_sand_format ++ // but it is better to be safe ++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->incfg.formats)) { ++ if (!(filter = avfilter_get_by_name("unsand"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, "", NULL, ++ graph)) < 0) ++ return ret; + +- if ((ret = avfilter_graph_create_filter(&convert, filter, +- inst_name, graph->scale_sws_opts, NULL, +- graph)) < 0) +- return ret; ++ extra_convert_tried |= 1; ++ can_retry = 1; ++ } ++ else ++#endif ++ { ++ if (!(filter = avfilter_get_by_name("scale"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, graph->scale_sws_opts, NULL, ++ graph)) < 0) ++ return ret; ++ } + break; + case AVMEDIA_TYPE_AUDIO: + if (!(filter = avfilter_get_by_name("aresample"))) { +@@ -589,6 +635,13 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + outlink->outcfg.samplerates) || + CHECKED_MERGE(channel_layouts, outlink->incfg.channel_layouts, + outlink->outcfg.channel_layouts))) { ++ // Try adding an unsand filter & see if that helps ++ if (ret 
< 0 && can_retry) { ++ link = outlink; ++ convert_needed = 1; ++ continue; ++ } ++ + if (ret < 0) + return ret; + av_log(log_ctx, AV_LOG_ERROR, +diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c +index 15d897cff6..c134759bbf 100644 +--- a/libavfilter/buffersink.c ++++ b/libavfilter/buffersink.c +@@ -58,6 +58,11 @@ typedef struct BufferSinkContext { + int sample_rates_size; + + AVFrame *peeked_frame; ++ ++ union { ++ av_buffersink_alloc_video_frame * video; ++ } alloc_cb; ++ void * alloc_v; + } BufferSinkContext; + + #define NB_ITEMS(list) (list ## _size / sizeof(*list)) +@@ -148,6 +153,22 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx, + return get_frame_internal(ctx, frame, 0, nb_samples); + } + ++static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h) ++{ ++ AVFilterContext * const ctx = link->dst; ++ BufferSinkContext * const bs = ctx->priv; ++ return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) : ++ ff_default_get_video_buffer(link, w, h); ++} ++ ++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v) ++{ ++ BufferSinkContext * const bs = ctx->priv; ++ bs->alloc_cb.video = cb; ++ bs->alloc_v = v; ++ return 0; ++} ++ + #if FF_API_BUFFERSINK_ALLOC + AVBufferSinkParams *av_buffersink_params_alloc(void) + { +@@ -331,6 +352,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, ++ .get_video_buffer = alloc_video_buffer, + }, + { NULL } + }; +diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h +index 69ed0f29a8..a3aa6fcb3c 100644 +--- a/libavfilter/buffersink.h ++++ b/libavfilter/buffersink.h +@@ -198,6 +198,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame); + */ + int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples); + ++typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, 
int h); ++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v); ++ + /** + * @} + */ +diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c +index da1cf9941e..c588ed23cb 100644 +--- a/libavfilter/buffersrc.c ++++ b/libavfilter/buffersrc.c +@@ -188,7 +188,7 @@ int attribute_align_arg av_buffersrc_add_frame_flags(AVFilterContext *ctx, AVFra + + switch (ctx->outputs[0]->type) { + case AVMEDIA_TYPE_VIDEO: +- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, ++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), + frame->format, frame->pts); + break; + case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index 889ff772ed..496cec72ef 100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -35,8 +35,29 @@ typedef struct BWDIFContext { + void (*filter_edge)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); ++ void (*filter_line3)(void *dst, int dstride, ++ const void *prev, const void *cur, const void *next, int prefs, ++ int w, int parity, int clip_max); + } BWDIFContext; + +-void ff_bwdif_init_x86(BWDIFContext *bwdif); ++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int 
parity, int clip_max); ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); + + #endif /* AVFILTER_BWDIF_H */ +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index b6aed7a450..b268113271 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -123,8 +123,8 @@ typedef struct ThreadData { + next2++; \ + } + +-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, +- int prefs3, int mrefs3, int parity, int clip_max) ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *cur = cur1; +@@ -133,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, + FILTER_INTRA() + } + +-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int prefs3, int mrefs3, int prefs4, int mrefs4, +- int parity, int clip_max) ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -151,9 +151,34 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + +-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int parity, int clip_max, int spat) ++#define NEXT_LINE()\ ++ dst += d_stride; \ ++ prev += prefs; \ ++ cur += prefs; \ ++ next += prefs; ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ const int prefs = s_stride; ++ uint8_t * dst = 
dst1; ++ const uint8_t * prev = prev1; ++ const uint8_t * cur = cur1; ++ const uint8_t * next = next1; ++ ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++ NEXT_LINE(); ++ memcpy(dst, cur, w); ++ NEXT_LINE(); ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++} ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -213,6 +238,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + ++// Round job start line down to multiple of 4 so that if filter_line3 exists ++// and the frame is a multiple of 4 high then filter_line will never be called ++static inline int job_start(const int jobnr, const int nb_jobs, const int h) ++{ ++ return jobnr >= nb_jobs ? 
h : ((h * jobnr) / nb_jobs) & ~3; ++} ++ + static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + { + BWDIFContext *s = ctx->priv; +@@ -222,8 +254,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1; + int df = (yadif->csp->comp[td->plane].depth + 7) / 8; + int refs = linesize / df; +- int slice_start = (td->h * jobnr ) / nb_jobs; +- int slice_end = (td->h * (jobnr+1)) / nb_jobs; ++ int slice_start = job_start(jobnr, nb_jobs, td->h); ++ int slice_end = job_start(jobnr + 1, nb_jobs, td->h); + int y; + + for (y = slice_start; y < slice_end; y++) { +@@ -245,6 +277,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + refs << 1, -(refs << 1), + td->parity ^ td->tff, clip_max, + (y < 2) || ((y + 3) > td->h) ? 0 : 1); ++ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) { ++ s->filter_line3(dst, td->frame->linesize[td->plane], ++ prev, cur, next, linesize, td->w, ++ td->parity ^ td->tff, clip_max); ++ y += 2; + } else { + s->filter_line(dst, prev, cur, next, td->w, + refs, -refs, refs << 1, -(refs << 1), +@@ -280,7 +317,8 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, + td.h = h; + td.plane = i; + +- ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(h, ff_filter_get_nb_threads(ctx))); ++ ctx->internal->execute(ctx, filter_slice, &td, NULL, ++ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx))); + } + if (yadif->current_field == YADIF_FIELD_END) { + yadif->current_field = YADIF_FIELD_NORMAL; +@@ -350,20 +388,29 @@ static int config_props(AVFilterLink *link) + + yadif->csp = av_pix_fmt_desc_get(link->format); + yadif->filter = filter; +- if (yadif->csp->comp[0].depth > 8) { ++ ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); ++ ++ return 0; ++} ++ ++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) ++{ ++ s->filter_line3 = 0; ++ if (bit_depth > 8) { + 
s->filter_intra = filter_intra_16bit; + s->filter_line = filter_line_c_16bit; + s->filter_edge = filter_edge_16bit; + } else { +- s->filter_intra = filter_intra; +- s->filter_line = filter_line_c; +- s->filter_edge = filter_edge; ++ s->filter_intra = ff_bwdif_filter_intra_c; ++ s->filter_line = ff_bwdif_filter_line_c; ++ s->filter_edge = ff_bwdif_filter_edge_c; + } + +- if (ARCH_X86) +- ff_bwdif_init_x86(s); +- +- return 0; ++#if ARCH_X86 ++ ff_bwdif_init_x86(s, bit_depth); ++#elif ARCH_AARCH64 ++ ff_bwdif_init_aarch64(s, bit_depth); ++#endif + } + + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +new file mode 100644 +index 0000000000..d4c11cfc51 +--- /dev/null ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -0,0 +1,2115 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * deinterlace video filter - V4L2 M2M ++ */ ++ ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "config.h" ++ ++#include "libavutil/avassert.h" ++#include "libavutil/avstring.h" ++#include "libavutil/common.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/internal.h" ++#include "libavutil/mathematics.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/time.h" ++ ++#define FF_INTERNAL_FIELDS 1 ++#include "framequeue.h" ++#include "filters.h" ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "scale_eval.h" ++#include "video.h" ++ ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in drm_fourcc.h hopefully will be sometime in the future but until then... 
++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ ++typedef struct V4L2Queue V4L2Queue; ++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; ++ ++typedef enum filter_type_v4l2_e ++{ ++ FILTER_V4L2_DEINTERLACE = 1, ++ FILTER_V4L2_SCALE, ++} filter_type_v4l2_t; ++ ++typedef struct V4L2Buffer { ++ int enqueued; ++ int reenqueue; ++ struct v4l2_buffer buffer; ++ AVFrame frame; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int num_planes; ++ AVDRMFrameDescriptor drm_frame; ++ V4L2Queue *q; ++} V4L2Buffer; ++ ++typedef struct V4L2Queue { ++ struct v4l2_format format; ++ struct v4l2_selection sel; ++ int eos; ++ int num_buffers; ++ V4L2Buffer *buffers; ++ const char * name; ++ DeintV4L2M2MContextShared *ctx; ++} V4L2Queue; ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++} pts_stats_t; ++ ++#define PTS_TRACK_SIZE 32 ++typedef struct pts_track_el_s ++{ ++ uint32_t n; ++ unsigned int interval; ++ AVFrame * props; ++} pts_track_el_t; ++ ++typedef struct pts_track_s ++{ ++ uint32_t n; ++ uint32_t last_n; ++ int got_2; ++ void * logctx; ++ pts_stats_t stats; ++ pts_track_el_t a[PTS_TRACK_SIZE]; ++} pts_track_t; ++ ++typedef enum drain_state_e ++{ ++ DRAIN_NONE = 0, // Not draining ++ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame ++ DRAIN_LAST, // Drain with long timeout last_frame in received on output expected ++ DRAIN_EOS, // Drain with long timeout EOS expected ++ DRAIN_DONE // Drained ++} drain_state_t; ++ ++typedef struct DeintV4L2M2MContextShared { ++ void * logctx; // For logging - will be NULL when done ++ filter_type_v4l2_t filter_type; ++ ++ int fd; ++ int done; // fd closed - 
awaiting all refs dropped ++ int width; ++ int height; ++ ++ int drain; // EOS received (inlink status) ++ drain_state_t drain_state; ++ int64_t drain_pts; // PTS associated with inlink status ++ ++ unsigned int frames_rx; ++ unsigned int frames_tx; ++ ++ // from options ++ int output_width; ++ int output_height; ++ enum AVPixelFormat output_format; ++ ++ int has_enc_stop; ++ // We expect to get exactly the same number of frames out as we put in ++ // We can drain by matching input to output ++ int one_to_one; ++ ++ int orig_width; ++ int orig_height; ++ atomic_uint refcount; ++ ++ AVBufferRef *hw_frames_ctx; ++ ++ unsigned int field_order; ++ ++ pts_track_t track; ++ ++ V4L2Queue output; ++ V4L2Queue capture; ++} DeintV4L2M2MContextShared; ++ ++typedef struct DeintV4L2M2MContext { ++ const AVClass *class; ++ ++ DeintV4L2M2MContextShared *shared; ++ ++ char * w_expr; ++ char * h_expr; ++ char * output_format_string;; ++ ++ int force_original_aspect_ratio; ++ int force_divisible_by; ++ ++ char *colour_primaries_string; ++ char *colour_transfer_string; ++ char *colour_matrix_string; ++ int colour_range; ++ char *chroma_location_string; ++ ++ enum AVColorPrimaries colour_primaries; ++ enum AVColorTransferCharacteristic colour_transfer; ++ enum AVColorSpace colour_matrix; ++ enum AVChromaLocation chroma_location; ++} DeintV4L2M2MContext; ++ ++ ++static inline int drain_frame_expected(const drain_state_t d) ++{ ++ return d == DRAIN_EOS || d == DRAIN_LAST; ++} ++ ++// These just list the ones we know we can cope with ++static uint32_t ++fmt_av_to_v4l2(const enum AVPixelFormat avfmt) ++{ ++ switch (avfmt) { ++ case AV_PIX_FMT_YUV420P: ++ return V4L2_PIX_FMT_YUV420; ++ case AV_PIX_FMT_NV12: ++ return V4L2_PIX_FMT_NV12; ++#if CONFIG_SAND ++ case AV_PIX_FMT_RPI4_8: ++ case AV_PIX_FMT_SAND128: ++ return V4L2_PIX_FMT_NV12_COL128; ++#endif ++ default: ++ break; ++ } ++ return 0; ++} ++ ++static enum AVPixelFormat ++fmt_v4l2_to_av(const uint32_t pixfmt) ++{ ++ switch (pixfmt) { 
++ case V4L2_PIX_FMT_YUV420: ++ return AV_PIX_FMT_YUV420P; ++ case V4L2_PIX_FMT_NV12: ++ return AV_PIX_FMT_NV12; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ return AV_PIX_FMT_RPI4_8; ++#endif ++ default: ++ break; ++ } ++ return AV_PIX_FMT_NONE; ++} ++ ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) ++{ ++ return stats->last_interval; ++} ++ ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; ++ } ++ } ++ ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ ++static inline uint32_t pts_track_next_n(pts_track_t * const trk) ++{ ++ if (++trk->n == 0) ++ trk->n = 1; ++ return trk->n; ++} ++ ++static int pts_track_get_frame(pts_track_t * const trk, 
const struct timeval tv, AVFrame * const dst) ++{ ++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); ++ pts_track_el_t * t; ++ ++ // As a first guess assume that n==0 means last frame ++ if (n == 0) { ++ n = trk->last_n; ++ if (n == 0) ++ goto fail; ++ } ++ ++ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ if (t->n != n) { ++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); ++ goto fail; ++ } ++ ++ // 1st frame is simple - just believe it ++ if (n != trk->last_n) { ++ trk->last_n = n; ++ trk->got_2 = 0; ++ return av_frame_copy_props(dst, t->props); ++ } ++ ++ // Only believe in a single interpolated frame ++ if (trk->got_2) ++ goto fail; ++ trk->got_2 = 1; ++ ++ av_frame_copy_props(dst, t->props); ++ ++ ++ // If we can't guess - don't ++ if (t->interval == 0) { ++ dst->best_effort_timestamp = AV_NOPTS_VALUE; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ } ++ else { ++ if (dst->best_effort_timestamp != AV_NOPTS_VALUE) ++ dst->best_effort_timestamp += t->interval / 2; ++ if (dst->pts != AV_NOPTS_VALUE) ++ dst->pts += t->interval / 2; ++ if (dst->pkt_dts != AV_NOPTS_VALUE) ++ dst->pkt_dts += t->interval / 2; ++ } ++ ++ return 0; ++ ++fail: ++ trk->last_n = 0; ++ trk->got_2 = 0; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ return 0; ++} ++ ++// We are only ever expecting in-order frames so nothing more clever is required ++static unsigned int ++pts_track_count(const pts_track_t * const trk) ++{ ++ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1); ++} ++ ++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) ++{ ++ const uint32_t n = pts_track_next_n(trk); ++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ pts_stats_add(&trk->stats, src->pts); ++ ++ t->n = n; ++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last ++ av_frame_unref(t->props); ++ 
av_frame_copy_props(t->props, src); ++ ++ // We now know what the previous interval was, rather than having to guess, ++ // so set it. There is a better than decent chance that this is before ++ // we use it. ++ if (t->interval != 0) { ++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); ++ prev_t->interval = t->interval; ++ } ++ ++ // In case deinterlace interpolates frames use every other usec ++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; ++} ++ ++static void pts_track_uninit(pts_track_t * const trk) ++{ ++ unsigned int i; ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ av_frame_free(&trk->a[i].props); ++ } ++} ++ ++static int pts_track_init(pts_track_t * const trk, void *logctx) ++{ ++ unsigned int i; ++ trk->n = 1; ++ pts_stats_init(&trk->stats, logctx, "track"); ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ if ((trk->a[i].props = av_frame_alloc()) == NULL) { ++ pts_track_uninit(trk); ++ return AVERROR(ENOMEM); ++ } ++ } ++ return 0; ++} ++ ++static inline uint32_t ++fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline; ++} ++ ++static inline uint32_t ++fmt_height(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++} ++ ++static inline uint32_t ++fmt_width(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++} ++ ++static inline uint32_t ++fmt_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static inline uint32_t ++buf_bytesused0(const struct v4l2_buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? 
buf->m.planes[0].bytesused : buf->bytesused; ++} ++ ++static void ++init_format(V4L2Queue * const q, const uint32_t format_type) ++{ ++ memset(&q->format, 0, sizeof(q->format)); ++ memset(&q->sel, 0, sizeof(q->sel)); ++ q->format.type = format_type; ++ q->sel.type = format_type; ++} ++ ++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) ++{ ++ struct v4l2_capability cap; ++ int ret; ++ ++ memset(&cap, 0, sizeof(cap)); ++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); ++ if (ret < 0) ++ return ret; ++ ++ if (ctx->filter_type == FILTER_V4L2_SCALE && ++ strcmp("bcm2835-codec-isp", cap.card) != 0) ++ { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { ++ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE); ++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE); ++ } ++ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); ++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ } ++ else { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++// Just use for probe - doesn't modify q format ++static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt) ++{ ++ struct v4l2_format fmt = {.type = queue->format.type}; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret, field; ++ // Pick YUV to test with if not otherwise specified ++ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? 
V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt); ++ enum AVPixelFormat r_avfmt; ++ ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); ++ ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type)) ++ field = V4L2_FIELD_INTERLACED_TB; ++ else ++ field = V4L2_FIELD_NONE; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ fmt.fmt.pix_mp.pixelformat = pixelformat; ++ fmt.fmt.pix_mp.field = field; ++ fmt.fmt.pix_mp.width = width; ++ fmt.fmt.pix_mp.height = height; ++ } else { ++ fmt.fmt.pix.pixelformat = pixelformat; ++ fmt.fmt.pix.field = field; ++ fmt.fmt.pix.width = width; ++ fmt.fmt.pix.height = height; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, ++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, ++ fmt.fmt.pix_mp.pixelformat, ++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt); ++ if (ret) ++ return AVERROR(EINVAL); ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, ++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, ++ fmt.fmt.pix_mp.pixelformat, ++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt)); ++ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); ++ return AVERROR(EINVAL); ++ } ++ if (r_avfmt == AV_PIX_FMT_NONE) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? 
"dest" : "src"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ if (fmt.fmt.pix_mp.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); ++ ++ return AVERROR(EINVAL); ++ } ++ } else { ++ if (fmt.fmt.pix.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); ++ ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++do_s_fmt(V4L2Queue * const q) ++{ ++ DeintV4L2M2MContextShared * const ctx = q->ctx; ++ const uint32_t pixelformat = fmt_pixelformat(&q->format); ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret)); ++ return ret; ++ } ++ ++ if (pixelformat != fmt_pixelformat(&q->format)) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format))); ++ return AVERROR(EINVAL); ++ } ++ ++ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, ++ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? 
V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret)); ++ } ++ ++ return 0; ++} ++ ++static void ++set_fmt_color(struct v4l2_format *const fmt, ++ const enum AVColorPrimaries avcp, ++ const enum AVColorSpace avcs, ++ const enum AVColorTransferCharacteristic avxc) ++{ ++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; ++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; ++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; ++ ++ switch (avcp) { ++ case AVCOL_PRI_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ ycbcr = V4L2_YCBCR_ENC_709; ++ break; ++ case AVCOL_PRI_BT470M: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ ycbcr = V4L2_YCBCR_ENC_601; ++ break; ++ case AVCOL_PRI_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_PRI_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_PRI_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_PRI_BT2020: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ case AVCOL_PRI_SMPTE428: ++ case AVCOL_PRI_SMPTE431: ++ case AVCOL_PRI_SMPTE432: ++ case AVCOL_PRI_EBU3213: ++ case AVCOL_PRI_RESERVED: ++ case AVCOL_PRI_FILM: ++ case AVCOL_PRI_UNSPECIFIED: ++ default: ++ break; ++ } ++ ++ switch (avcs) { ++ case AVCOL_SPC_RGB: ++ cs = V4L2_COLORSPACE_SRGB; ++ break; ++ case AVCOL_SPC_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ break; ++ case AVCOL_SPC_FCC: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ break; ++ case AVCOL_SPC_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_SPC_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_SPC_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_SPC_BT2020_CL: ++ cs = V4L2_COLORSPACE_BT2020; ++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; ++ break; ++ case AVCOL_SPC_BT2020_NCL: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ default: ++ break; 
++ } ++ ++ switch (avxc) { /* select on the FFmpeg transfer characteristic argument; xfer only holds the V4L2 result */ ++ case AVCOL_TRC_BT709: ++ xfer = V4L2_XFER_FUNC_709; ++ break; ++ case AVCOL_TRC_IEC61966_2_1: ++ xfer = V4L2_XFER_FUNC_SRGB; ++ break; ++ case AVCOL_TRC_SMPTE240M: ++ xfer = V4L2_XFER_FUNC_SMPTE240M; ++ break; ++ case AVCOL_TRC_SMPTE2084: ++ xfer = V4L2_XFER_FUNC_SMPTE2084; ++ break; ++ default: ++ break; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.colorspace = cs; ++ fmt->fmt.pix_mp.ycbcr_enc = ycbcr; ++ fmt->fmt.pix_mp.xfer_func = xfer; ++ } else { ++ fmt->fmt.pix.colorspace = cs; ++ fmt->fmt.pix.ycbcr_enc = ycbcr; ++ fmt->fmt.pix.xfer_func = xfer; ++ } ++} ++ ++static void ++set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr) ++{ ++ const enum v4l2_quantization q = ++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : ++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : ++ V4L2_QUANTIZATION_DEFAULT; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.quantization = q; ++ } else { ++ fmt->fmt.pix.quantization = q; ++ } ++} ++ ++static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ switch(ycbcr) { ++ case V4L2_YCBCR_ENC_XV709: ++ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709; ++ case V4L2_YCBCR_ENC_XV601: ++ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M; ++ default: ++ break; ++ } ++ ++ switch(cs) { ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M; ++ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020; ++ default: ++ break; ++ } ++ ++ return AVCOL_PRI_UNSPECIFIED; ++} ++ ++static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ switch(cs) { ++ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; ++ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; ++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; ++ case V4L2_COLORSPACE_BT2020: ++ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) ++ return AVCOL_SPC_BT2020_CL; ++ else ++ return AVCOL_SPC_BT2020_NCL; ++ default: ++ break; ++ } ++ ++ return AVCOL_SPC_UNSPECIFIED; ++} ++ ++static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_xfer_func xfer; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.xfer_func: ++ fmt->fmt.pix.xfer_func; ++ ++ switch (xfer) { ++ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709; ++ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1; ++ default: ++ break; ++ } ++ ++ switch (cs) { ++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22; ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M; ++ default: ++ break; ++ } ++ ++ switch (ycbcr) { ++ case V4L2_YCBCR_ENC_XV709: ++ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG; ++ default: ++ break; ++ } ++ ++ return AVCOL_TRC_UNSPECIFIED; ++} ++ ++static enum AVColorRange get_color_range(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_quantization qt; ++ ++ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.quantization : ++ fmt->fmt.pix.quantization; ++ ++ switch (qt) { ++ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; ++ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; ++ default: ++ break; ++ } ++ ++ return AVCOL_RANGE_UNSPECIFIED; ++} ++ ++static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) ++{ ++ struct v4l2_format *const format = &q->format; ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++#if CONFIG_SAND ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++#endif ++ break; ++ ++ case DRM_FORMAT_P030: ++#if CONFIG_SAND ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++#endif ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ 
pix->bytesperline = bpl; ++ } ++ ++ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc); ++ set_fmt_color_range(format, frame->color_range); ++ ++ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right); ++ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom); ++ q->sel.r.left = frame->crop_left; ++ q->sel.r.top = frame->crop_top; ++ ++ return 0; ++} ++ ++ ++static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height) ++{ ++ struct v4l2_format * const fmt = &queue->format; ++ struct v4l2_selection *const sel = &queue->sel; ++ ++ memset(&fmt->fmt, 0, sizeof(fmt->fmt)); ++ ++ // Align w/h to 16 here in case there are alignment requirements at the next ++ // stage of the filter chain (also RPi deinterlace setup is bust and this ++ // fixes it) ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = pixelformat; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = FFALIGN(width, 16); ++ fmt->fmt.pix_mp.height = FFALIGN(height, 16); ++ } else { ++ fmt->fmt.pix.pixelformat = pixelformat; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = FFALIGN(width, 16); ++ fmt->fmt.pix.height = FFALIGN(height, 16); ++ } ++ ++ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer); ++ set_fmt_color_range(fmt, priv->colour_range); ++ ++ sel->r.width = width; ++ sel->r.height = height; ++ sel->r.left = 0; ++ sel->r.top = 0; ++ ++ return do_s_fmt(queue); ++} ++ ++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) ++{ ++ int ret; ++ ++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->fd < 0) ++ return AVERROR(errno); ++ ++ ret = deint_v4l2m2m_prepare_context(ctx); ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n"); ++ goto fail; ++ } ++ ++ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, 
ctx->output_height, ctx->output_format); ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n"); ++ goto fail; ++ } ++ ++ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE); ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n"); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ close(ctx->fd); ++ ctx->fd = -1; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) ++{ ++ int ret = AVERROR(EINVAL); ++ struct dirent *entry; ++ char node[PATH_MAX]; ++ DIR *dirp; ++ ++ dirp = opendir("/dev"); ++ if (!dirp) ++ return AVERROR(errno); ++ ++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { ++ ++ if (strncmp(entry->d_name, "video", 5)) ++ continue; ++ ++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); ++ ret = deint_v4l2m2m_probe_device(ctx, node); ++ if (!ret) ++ break; ++ } ++ ++ closedir(dirp); ++ ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); ++ ctx->fd = -1; ++ ++ return ret; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) ++{ ++ int ret; ++ ++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ buf->enqueued = 1; ++ ++ return 0; ++} ++ ++static void ++drm_frame_init(AVDRMFrameDescriptor * const d) ++{ ++ unsigned int i; ++ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) { ++ d->objects[i].fd = -1; ++ } ++} ++ ++static void ++drm_frame_uninit(AVDRMFrameDescriptor * const d) ++{ ++ unsigned int i; ++ for (i = 0; i != d->nb_objects; ++i) { ++ if (d->objects[i].fd != -1) { ++ close(d->objects[i].fd); ++ d->objects[i].fd = -1; ++ } ++ } ++} ++ ++static void ++avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n) ++{ ++ unsigned int i; ++ V4L2Buffer* const avbufs = 
*ppavbufs; ++ ++ if (avbufs == NULL) ++ return; ++ *ppavbufs = NULL; ++ ++ for (i = 0; i != n; ++i) { ++ V4L2Buffer* const avbuf = avbufs + i; ++ drm_frame_uninit(&avbuf->drm_frame); ++ } ++ ++ av_free(avbufs); ++} ++ ++static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ uint64_t mod = DRM_FORMAT_MOD_LINEAR; ++ ++ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; ++ const struct v4l2_format *const fmt = &q->format; ++ const uint32_t height = fmt_height(fmt); ++ ptrdiff_t bpl0; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_layers = 1; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = fmt_bpl(fmt, i); ++ } ++ bpl0 = layer->planes[0].pitch; ++ ++ switch (fmt_pixelformat(fmt)) { ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); ++ layer->format = V4L2_PIX_FMT_NV12; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = fmt_width(fmt); ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ break; ++#endif ++ ++ case DRM_FORMAT_NV12: ++ layer->format = V4L2_PIX_FMT_NV12; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = bpl0 * height; ++ layer->planes[1].pitch = bpl0; ++ break; ++ ++ case V4L2_PIX_FMT_YUV420: ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = bpl0 * height; ++ layer->planes[1].pitch = bpl0 / 2; ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset 
= layer->planes[1].offset + ((bpl0 * height) / 4); ++ layer->planes[2].pitch = bpl0 / 2; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ return AVERROR(EINVAL); ++ } ++ ++ drm_desc->nb_objects = 0; ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buffer.index; ++ expbuf.type = avbuf->buffer.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ? ++ avbuf->buffer.m.planes[i].length : avbuf->buffer.length; ++ drm_desc->objects[i].fd = expbuf.fd; ++ drm_desc->objects[i].format_modifier = mod; ++ drm_desc->nb_objects = i + 1; ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_requestbuffers req; ++ int ret, i, multiplanar; ++ uint32_t memory; ++ ++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; ++ ++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); ++ ++ memset(&req, 0, sizeof(req)); ++ req.count = queue->num_buffers; ++ req.memory = memory; ++ req.type = fmt->type; ++ ++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); ++ ++ return AVERROR(errno); ++ } ++ ++ queue->num_buffers = req.count; ++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); ++ if (!queue->buffers) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); ++ ++ return AVERROR(ENOMEM); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer * const buf = &queue->buffers[i]; ++ ++ buf->enqueued = 0; ++ buf->q = queue; ++ ++ buf->buffer.type = fmt->type; ++ buf->buffer.memory = memory; ++ buf->buffer.index = i; ++ ++ if (multiplanar) { ++ buf->buffer.length = VIDEO_MAX_PLANES; ++ buf->buffer.m.planes = buf->planes; ++ } ++ ++ drm_frame_init(&buf->drm_frame); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer * const buf = &queue->buffers[i]; ++ ++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ ret = AVERROR(errno); ++ ++ goto fail; ++ } ++ ++ buf->num_planes = multiplanar ? 
buf->buffer.length : 1; ++ ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ++ ret = deint_v4l2m2m_enqueue_buffer(buf); ++ if (ret) ++ goto fail; ++ ++ ret = v4l2_buffer_export_drm(queue, buf); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ avbufs_delete(&queue->buffers, queue->num_buffers); ++ queue->num_buffers = 0; ++ return ret; ++} ++ ++static int deint_v4l2m2m_streamon(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_streamoff(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++// timeout in ms ++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) ++{ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_buffer buf = { 0 }; ++ V4L2Buffer* avbuf = NULL; ++ struct pollfd pfd; ++ short events; ++ int ret; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ events = POLLOUT | POLLWRNORM; ++ else ++ events = POLLIN | POLLRDNORM; ++ ++ pfd.events = events; ++ pfd.fd = ctx->fd; ++ ++ for (;;) { ++ ret = poll(&pfd, 1, timeout); ++ if (ret > 0) ++ break; ++ if (ret < 0 && errno == EINTR) /* errno is only meaningful when poll() failed; a timeout (ret == 0) must not retry on a stale EINTR */ ++ continue; ++ return NULL; ++ } ++ ++ if (pfd.revents & POLLERR) ++ return NULL; ++ ++ if (pfd.revents & events) { ++ memset(&buf, 0, sizeof(buf)); ++ buf.memory = V4L2_MEMORY_MMAP; ++ buf.type = queue->format.type; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++
memset(planes, 0, sizeof(planes)); ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); ++ if (ret) { ++ if (errno != EAGAIN) ++ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", ++ av_err2str(AVERROR(errno))); ++ return NULL; ++ } ++ ++ avbuf = &queue->buffers[buf.index]; ++ avbuf->enqueued = 0; ++ avbuf->buffer = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buffer.m.planes = avbuf->planes; ++ } ++ return avbuf; ++ } ++ ++ return NULL; ++} ++ ++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (!queue->buffers[i].enqueued) { ++ buf = &queue->buffers[i]; ++ break; ++ } ++ return buf; ++} ++ ++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ if (!queue || !queue->buffers) ++ return; ++ for (i = 0; i < queue->num_buffers; i++) { ++ buf = &queue->buffers[i]; ++ if (queue->buffers[i].enqueued) ++ av_frame_unref(&buf->frame); ++ } ++} ++ ++static void recycle_q(V4L2Queue * const queue) ++{ ++ V4L2Buffer* avbuf; ++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { ++ av_frame_unref(&avbuf->frame); ++ } ++} ++ ++static int count_enqueued(V4L2Queue *queue) ++{ ++ int i; ++ int n = 0; ++ ++ if (queue->buffers == NULL) ++ return 0; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].enqueued) ++ ++n; ++ return n; ++} ++ ++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) ++{ ++ DeintV4L2M2MContextShared *const ctx = queue->ctx; ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ V4L2Buffer *buf; ++ int i; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ recycle_q(queue); ++ ++ buf = deint_v4l2m2m_find_free_buf(queue); ++ if (!buf) { ++ av_log(ctx->logctx, 
AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); ++ return AVERROR(EAGAIN); ++ } ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) ++ for (i = 0; i < drm_desc->nb_objects; i++) ++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; ++ else ++ buf->buffer.m.fd = drm_desc->objects[0].fd; ++ ++ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : ++ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB : ++ V4L2_FIELD_INTERLACED_BT; ++ ++ if (ctx->field_order != buf->buffer.field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); ++ ctx->field_order = buf->buffer.field; ++ } ++ ++ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); ++ ++ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; ++ ++ av_frame_move_ref(&buf->frame, frame); ++ ++ return deint_v4l2m2m_enqueue_buffer(buf); ++} ++ ++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) ++{ ++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); ++ ++ if (ctx->fd >= 0) { ++ deint_v4l2m2m_streamoff(capture); ++ deint_v4l2m2m_streamoff(output); ++ } ++ ++ avbufs_delete(&capture->buffers, capture->num_buffers); ++ ++ deint_v4l2m2m_unref_queued(output); ++ ++ av_buffer_unref(&ctx->hw_frames_ctx); ++ ++ if (capture->buffers) ++ av_free(capture->buffers); ++ ++ if (output->buffers) ++ av_free(output->buffers); ++ ++ if (ctx->fd >= 0) { ++ close(ctx->fd); ++ ctx->fd = -1; ++ } ++ ++ av_free(ctx); ++ } ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++{ ++ V4L2Buffer *buf = opaque; ++ DeintV4L2M2MContextShared *ctx = buf->q->ctx; ++ ++ if (!ctx->done) ++ deint_v4l2m2m_enqueue_buffer(buf); ++ ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++// timeout in ms ++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, 
int timeout) ++{ ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ V4L2Buffer* avbuf; ++ enum AVColorPrimaries color_primaries; ++ enum AVColorSpace colorspace; ++ enum AVColorTransferCharacteristic color_trc; ++ enum AVColorRange color_range; ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ if (queue->eos) { ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__); ++ return AVERROR_EOF; ++ } ++ ++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); ++ if (!avbuf) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); ++ return AVERROR(EAGAIN); ++ } ++ ++ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) { ++ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0) ++ queue->eos = 1; ++ if (buf_bytesused0(&avbuf->buffer) == 0) ++ return queue->eos ? AVERROR_EOF : AVERROR(EINVAL); ++ } ++ ++ // Fill in PTS and ancillary info from src frame ++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); ++ ++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, ++ sizeof(avbuf->drm_frame), v4l2_free_buffer, ++ avbuf, AV_BUFFER_FLAG_READONLY); ++ if (!frame->buf[0]) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ ++ atomic_fetch_add(&ctx->refcount, 1); ++ ++ frame->data[0] = (uint8_t *)&avbuf->drm_frame; ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (ctx->hw_frames_ctx) ++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); ++ frame->height = ctx->output_height; ++ frame->width = ctx->output_width; ++ ++ color_primaries = get_color_primaries(&ctx->capture.format); ++ colorspace = get_color_space(&ctx->capture.format); ++ color_trc = get_color_trc(&ctx->capture.format); ++ color_range = get_color_range(&ctx->capture.format); ++ ++ // If the color parameters are unspecified by V4L2 then leave alone as they ++ // will have been copied from src ++ if (color_primaries != AVCOL_PRI_UNSPECIFIED) ++
frame->color_primaries = color_primaries; ++ if (colorspace != AVCOL_SPC_UNSPECIFIED) ++ frame->colorspace = colorspace; ++ if (color_trc != AVCOL_TRC_UNSPECIFIED) ++ frame->color_trc = color_trc; ++ if (color_range != AVCOL_RANGE_UNSPECIFIED) ++ frame->color_range = color_range; ++ ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) { ++ // Not interlaced now ++ frame->interlaced_frame = 0; // *** Fill in from dst buffer? ++ frame->top_field_first = 0; ++ // Pkt duration halved ++ frame->pkt_duration /= 2; ++ } ++ ++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); ++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); ++ return 0; ++} ++ ++static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ++{ ++ AVFilterLink *inlink = outlink->src->inputs[0]; ++ AVFilterContext *avctx = outlink->src; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ int ret; ++ ++ ctx->height = avctx->inputs[0]->h; ++ ctx->width = avctx->inputs[0]->w; ++ ++ if (ctx->filter_type == FILTER_V4L2_SCALE) { ++ if ((ret = ff_scale_eval_dimensions(priv, ++ priv->w_expr, priv->h_expr, ++ inlink, outlink, ++ &ctx->output_width, &ctx->output_height)) < 0) ++ return ret; ++ ++ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height, ++ priv->force_original_aspect_ratio, priv->force_divisible_by); ++ } ++ else { ++ ctx->output_width = ctx->width; ++ ctx->output_height = ctx->height; ++ } ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__, ++ ctx->width, ctx->height, ctx->output_width, ctx->output_height, ++ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den); ++ ++ outlink->time_base = inlink->time_base; ++ outlink->w = ctx->output_width; ++ outlink->h = ctx->output_height; ++ outlink->format 
= inlink->format; ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0) ++ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den}; ++ ++ if (inlink->sample_aspect_ratio.num) ++ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); ++ else ++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ ++ ret = deint_v4l2m2m_find_device(ctx); ++ if (ret) ++ return ret; ++ ++ if (inlink->hw_frames_ctx) { ++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); ++ if (!ctx->hw_frames_ctx) ++ return AVERROR(ENOMEM); ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) ++{ ++ static const enum AVPixelFormat pixel_formats[] = { ++ AV_PIX_FMT_DRM_PRIME, ++// AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_NONE, ++ }; ++ ++ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); ++} ++ ++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) ++{ ++ const uint64_t mod = drm_desc->objects[0].format_modifier; ++ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID); ++ ++ // Only currently support single object things ++ if (drm_desc->nb_objects != 1) ++ return 0; ++ ++ switch (drm_desc->layers[0].format) { ++ case DRM_FORMAT_YUV420: ++ return is_linear ? V4L2_PIX_FMT_YUV420 : 0; ++ case DRM_FORMAT_NV12: ++ return is_linear ? V4L2_PIX_FMT_NV12 : ++#if CONFIG_SAND ++ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? 
V4L2_PIX_FMT_NV12_COL128 : ++#endif ++ 0; ++ default: ++ break; ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *avctx = link->dst; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int ret; ++ ++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n", ++ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); ++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, ++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); ++ ++ if (ctx->field_order == V4L2_FIELD_ANY) { ++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ uint32_t pixelformat = desc_pixelformat(drm_desc); ++ ++ if (pixelformat == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), ++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, ++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); ++ ++ if ((ret = set_src_fmt(output, in)) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier); ++ return ret; ++ } ++ ++ ret = do_s_fmt(output); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n"); ++ return 
ret; ++ } ++ ++ if (ctx->output_format != AV_PIX_FMT_NONE) ++ pixelformat = fmt_av_to_v4l2(ctx->output_format); ++ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n"); ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_allocate_buffers(capture); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n"); ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_streamon(capture); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret)); ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_allocate_buffers(output); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n"); ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_streamon(output); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret)); ++ return ret; ++ } ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ ++ { ++ struct v4l2_encoder_cmd ecmd = { ++ .cmd = V4L2_ENC_CMD_STOP ++ }; ++ ctx->has_enc_stop = 0; ++ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n"); ++ ctx->has_enc_stop = 1; ++ } ++ else { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno))); ++ } ++ ++ } ++ } ++ ++ ret = deint_v4l2m2m_enqueue_frame(output, in); ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); ++ return ret; ++} ++ ++static int ++ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s, ++ AVFilterLink * const inlink) ++{ ++ int instatus; ++ int64_t inpts; ++ ++ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0) ++ return 0; ++ ++ s->drain = instatus; ++ s->drain_pts = inpts; ++ s->drain_state = DRAIN_TIMEOUT; ++ 
++ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started ++ s->drain_state = DRAIN_DONE; ++ } ++ else if (s->one_to_one) { ++ s->drain_state = DRAIN_LAST; ++ } ++ else if (s->has_enc_stop) { ++ struct v4l2_encoder_cmd ecmd = { ++ .cmd = V4L2_ENC_CMD_STOP ++ }; ++ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) { ++ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n"); ++ s->drain_state = DRAIN_EOS; ++ } ++ else { ++ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno))); ++ } ++ } ++ return 1; ++} ++ ++static int deint_v4l2m2m_activate(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared *const s = priv->shared; ++ AVFilterLink * const outlink = avctx->outputs[0]; ++ AVFilterLink * const inlink = avctx->inputs[0]; ++ int n = 0; ++ int cn = 99; ++ int did_something = 0; ++ ++ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); ++ ++ ack_inlink(avctx, s, inlink); ++ ++ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! ++ { ++ AVFrame * frame = av_frame_alloc(); ++ int rv; ++ ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, ++ drain_frame_expected(s->drain_state) || n > 4 ? 
300 : 0); ++ if (rv != 0) { ++ av_frame_free(&frame); ++ if (rv == AVERROR_EOF) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } ++ else if (rv == AVERROR(EAGAIN)) { ++ if (s->drain_state != DRAIN_NONE) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } ++ } ++ else { ++ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ } ++ else { ++ frame->interlaced_frame = 0; ++ // frame is always consumed by filter_frame - even on error despite ++ // a somewhat confusing comment in the header ++ rv = ff_filter_frame(outlink, frame); ++ ++s->frames_tx; ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); ++ did_something = 1; ++ ++ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } ++ } ++ ++ cn = count_enqueued(&s->capture); ++ } ++ ++ if (s->drain_state == DRAIN_DONE) { ++ ff_outlink_set_status(outlink, s->drain, s->drain_pts); ++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain)); ++ return 0; ++ } ++ ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ while (n < 6 && !s->drain) { ++ AVFrame * frame; ++ int rv; ++ ++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { ++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ if (!ack_inlink(avctx, s, inlink)) { ++ ff_inlink_request_frame(inlink); ++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ } ++ break; ++ } ++ ++s->frames_rx; ++ ++ rv = deint_v4l2m2m_filter_frame(inlink, frame); ++ av_frame_free(&frame); ++ ++ if (rv != 0) ++ return rv; ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", 
__func__); ++ did_something = 1; ++ ++n; ++ } ++ ++ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) { ++ ff_filter_set_ready(avctx, 1); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); ++ } ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); ++ return did_something ? 0 : FFERROR_NOT_READY; ++} ++ ++static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); ++ ++ if (!ctx) { ++ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ priv->shared = ctx; ++ ctx->logctx = priv; ++ ctx->filter_type = filter_type; ++ ctx->fd = -1; ++ ctx->output.ctx = ctx; ++ ctx->output.num_buffers = 8; ++ ctx->output.name = "OUTPUT"; ++ ctx->capture.ctx = ctx; ++ ctx->capture.num_buffers = 12; ++ ctx->capture.name = "CAPTURE"; ++ ctx->done = 0; ++ ctx->field_order = V4L2_FIELD_ANY; ++ ++ pts_track_init(&ctx->track, priv); ++ ++ atomic_init(&ctx->refcount, 1); ++ ++ if (priv->output_format_string) { ++ ctx->output_format = av_get_pix_fmt(priv->output_format_string); ++ if (ctx->output_format == AV_PIX_FMT_NONE) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string); ++ return AVERROR(EINVAL); ++ } ++ if (fmt_av_to_v4l2(ctx->output_format) == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format)); ++ return AVERROR(EINVAL); ++ } ++ } else { ++ // Use the input format once that is configured. 
++ ctx->output_format = AV_PIX_FMT_NONE; ++ } ++ ++#define STRING_OPTION(var_name, func_name, default_value) do { \ ++ if (priv->var_name ## _string) { \ ++ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \ ++ if (var < 0) { \ ++ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \ ++ return AVERROR(EINVAL); \ ++ } \ ++ priv->var_name = var; \ ++ } else { \ ++ priv->var_name = default_value; \ ++ } \ ++ } while (0) ++ ++ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED); ++ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED); ++ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED); ++ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED); ++ ++ return 0; ++} ++ ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE); ++} ++ ++static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ int rv; ++ DeintV4L2M2MContext * priv; ++ DeintV4L2M2MContextShared * ctx; ++ ++ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0) ++ return rv; ++ ++ priv = avctx->priv; ++ ctx = priv->shared; ++ ++ ctx->one_to_one = 1; ++ return 0; ++} ++ ++static void deint_v4l2m2m_uninit(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ ++ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", ++ ctx->frames_rx, ctx->frames_tx); ++ ctx->done = 1; ++ ctx->logctx = NULL; // Log to NULL works, log to missing crashes ++ pts_track_uninit(&ctx->track); ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static const AVOption deinterlace_v4l2m2m_options[] = { ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); ++ ++#define OFFSET(x) offsetof(DeintV4L2M2MContext, x) ++#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) ++ ++static const AVOption scale_v4l2m2m_options[] = { ++ { "w", "Output video width", 
++ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS }, ++ { "h", "Output video height", ++ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS }, ++ { "format", "Output video format (software format of hardware frames)", ++ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS }, ++ // These colour properties match the ones of the same name in vf_scale. ++ { "out_color_matrix", "Output colour matrix coefficient set", ++ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS }, ++ { "out_range", "Output colour range", ++ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, ++ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" }, ++ { "full", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ { "limited", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "jpeg", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ { "mpeg", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "tv", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "pc", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ // These colour properties match the ones in the VAAPI scaler ++ { "out_color_primaries", "Output colour primaries", ++ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "out_color_transfer", "Output colour transfer characteristics", ++ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "out_chroma_location", "Output chroma sample location", ++ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original 
AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, ++ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS }, ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(scale_v4l2m2m); ++ ++static const AVFilterPad deint_v4l2m2m_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad deint_v4l2m2m_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = deint_v4l2m2m_config_props, ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_deinterlace_v4l2m2m = { ++ .name = "deinterlace_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &deint_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ .query_formats = &deint_v4l2m2m_query_formats, ++ .inputs = deint_v4l2m2m_inputs, ++ .outputs = deint_v4l2m2m_outputs, ++ .priv_class = &deinterlace_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; ++ ++AVFilter ff_vf_scale_v4l2m2m = { ++ .name = "scale_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &scale_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ .query_formats = &deint_v4l2m2m_query_formats, ++ .inputs = deint_v4l2m2m_inputs, ++ .outputs = deint_v4l2m2m_outputs, ++ .priv_class = &scale_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; ++ +diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c +new file mode 100644 +index 0000000000..61c03a385c +--- /dev/null ++++ b/libavfilter/vf_unsand.c +@@ -0,0 +1,229 @@ ++/* ++ * Copyright (c) 2007 Bobby Bingham ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * unsand video filter: converts sand pixel formats to planar YUV ++ */ ++ ++#include ++ ++#include "libavutil/internal.h" ++#include "libavutil/mem.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/opt.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct UnsandContext { ++ const AVClass *class; ++} UnsandContext; ++ ++static av_cold void uninit(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ ++ return 0; ++} ++ ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterLink * const outlink = link->dst->outputs[0]; ++ AVFrame *out = NULL; ++ int rv = 0; ++ ++ if (outlink->format == in->format) { ++ // If nothing to do then do nothing ++ out = in; ++ } ++ else ++ { ++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) ++ { ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ if (av_rpi_sand_to_planar_frame(out, in) != 0) ++ { ++ rv = -1; ++ goto fail; ++ } ++ ++ av_frame_free(&in); ++ } ++ ++ return ff_filter_frame(outlink, out); ++ ++fail: ++
av_frame_free(&out); ++ av_frame_free(&in); ++ return rv; ++} ++ ++#if 0 ++static void dump_fmts(const AVFilterFormats * fmts) ++{ ++ int i; ++ if (fmts== NULL) { ++ printf("NULL\n"); ++ return; ++ } ++ for (i = 0; i < fmts->nb_formats; ++i) { ++ printf(" %d", fmts->formats[i]); ++ } ++ printf("\n"); ++} ++#endif ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ int ret; ++ ++ // If we aren't connected at both ends then just do nothing ++ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) ++ return 0; ++ ++ // Our output formats depend on our input formats and we can't/don't ++ // want to convert between bit depths so we need to wait for the source ++ // to have an opinion before we do ++ if (ctx->inputs[0]->incfg.formats == NULL) ++ return AVERROR(EAGAIN); ++ ++ // Accept anything ++ if (ctx->inputs[0]->outcfg.formats == NULL && ++ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) ++ return ret; ++ ++ // Filter out sand formats ++ ++ // Generate a container if we don't already have one ++ if (ctx->outputs[0]->incfg.formats == NULL) ++ { ++ // Somewhat rubbish way of ensuring we have a good structure ++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; ++ AVFilterFormats *formats = ff_make_format_list(out_fmts); ++ ++ if (formats == NULL) ++ return AVERROR(ENOMEM); ++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) ++ return ret; ++ } ++ ++ // Replace old format list with new filtered list derived from what our ++ // input says it can do ++ { ++ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; ++ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; ++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); ++ int i; ++ int n = 0; ++ int seen_420p = 0; ++ int seen_420p10 = 0; ++ ++ for (i = 0; i < src_ff->nb_formats; ++i) { ++ 
const enum AVPixelFormat f = src_ff->formats[i]; ++ ++ switch (f){ ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ if (!seen_420p) { ++ seen_420p = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ case AV_PIX_FMT_YUV420P10: ++ case AV_PIX_FMT_RPI4_10: ++ if (!seen_420p10) { ++ seen_420p10 = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; ++ } ++ break; ++ default: ++ dst_fmts[n++] = f; ++ break; ++ } ++ } ++ ++ av_freep(&dst_ff->formats); ++ dst_ff->formats = dst_fmts; ++ dst_ff->nb_formats = n; ++ } ++ ++// printf("Unsand: %s calc: ", __func__); ++// dump_fmts(ctx->outputs[0]->incfg.formats); ++ ++ return 0; ++} ++ ++ ++#define OFFSET(x) offsetof(UnsandContext, x) ++static const AVOption unsand_options[] = { ++ { NULL } ++}; ++ ++ ++AVFILTER_DEFINE_CLASS(unsand); ++ ++static const AVFilterPad avfilter_vf_unsand_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad avfilter_vf_unsand_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_unsand = { ++ .name = "unsand", ++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), ++ ++ .init = init, ++ .uninit = uninit, ++ ++ .query_formats = query_formats, ++ ++ .priv_size = sizeof(UnsandContext), ++ .priv_class = &unsand_class, ++ ++ .inputs = avfilter_vf_unsand_inputs, ++ .outputs = avfilter_vf_unsand_outputs, ++}; ++ +diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c +index b1e70b3bc6..b9e3a25921 100644 +--- a/libavfilter/x86/vf_bwdif_init.c ++++ b/libavfilter/x86/vf_bwdif_init.c +@@ -51,11 +51,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); + +-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) ++av_cold void 
ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) + { +- YADIFContext *yadif = &bwdif->yadif; + int cpu_flags = av_get_cpu_flags(); +- int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth; + + if (bit_depth <= 8) { + #if ARCH_X86_32 +diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c +index b4284a8778..692265593c 100644 +--- a/libavformat/matroskaenc.c ++++ b/libavformat/matroskaenc.c +@@ -58,6 +58,9 @@ + * Info, Tracks, Chapters, Attachments, Tags (potentially twice) and Cues */ + #define MAX_SEEKHEAD_ENTRIES 7 + ++/* Reserved size for H264 headers if not extant at init time */ ++#define MAX_H264_HEADER_SIZE 1024 ++ + #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \ + !(mkv)->is_live) + +@@ -721,8 +724,12 @@ static int mkv_write_native_codecprivate(AVFormatContext *s, AVIOContext *pb, + case AV_CODEC_ID_WAVPACK: + return put_wv_codecpriv(dyn_cp, par); + case AV_CODEC_ID_H264: +- return ff_isom_write_avcc(dyn_cp, par->extradata, +- par->extradata_size); ++ if (par->extradata_size) ++ return ff_isom_write_avcc(dyn_cp, par->extradata, ++ par->extradata_size); ++ else ++ put_ebml_void(pb, MAX_H264_HEADER_SIZE); ++ break; + case AV_CODEC_ID_HEVC: + return ff_isom_write_hvcc(dyn_cp, par->extradata, + par->extradata_size, 0); +@@ -2259,7 +2266,9 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) + break; + // FIXME: Remove the following once libaom starts propagating extradata during init() + // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012 ++ // H264 V4L2 has a similar issue + case AV_CODEC_ID_AV1: ++ case AV_CODEC_ID_H264: + if (side_data_size && mkv->track.bc && !par->extradata_size) { + AVIOContext *dyn_cp; + uint8_t *codecpriv; +@@ -2267,7 +2276,10 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) + ret = avio_open_dyn_buf(&dyn_cp); + if (ret < 0) + return ret; +- ff_isom_write_av1c(dyn_cp, side_data, side_data_size); ++ if (par->codec_id 
== AV_CODEC_ID_H264) ++ ff_isom_write_avcc(dyn_cp, side_data, side_data_size); ++ else ++ ff_isom_write_av1c(dyn_cp, side_data, side_data_size); + codecpriv_size = avio_get_dyn_buf(dyn_cp, &codecpriv); + if ((ret = dyn_cp->error) < 0 || + !codecpriv_size && (ret = AVERROR_INVALIDDATA)) { +@@ -2275,8 +2287,25 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) + return ret; + } + avio_seek(mkv->track.bc, track->codecpriv_offset, SEEK_SET); +- // Do not write the OBUs as we don't have space saved for them +- put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4); ++ if (par->codec_id == AV_CODEC_ID_H264) { ++ int filler; ++ // Up to 6 bytes for header and the filler must be at least 2 ++ if (codecpriv_size > MAX_H264_HEADER_SIZE - 8) { ++ av_log(s, AV_LOG_ERROR, "H264 header size %d > %d bytes\n", codecpriv_size, MAX_H264_HEADER_SIZE - 8); ++ return AVERROR_INVALIDDATA; ++ } ++ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, codecpriv_size); ++ filler = MAX_H264_HEADER_SIZE - (avio_tell(mkv->track.bc) - track->codecpriv_offset); ++ if (filler < 2) { ++ av_log(s, AV_LOG_ERROR, "Unexpected SPS/PPS filler length: %d\n", filler); ++ return AVERROR_BUG; ++ } ++ put_ebml_void(mkv->track.bc, filler); ++ } ++ else { ++ // Do not write the OBUs as we don't have space saved for them ++ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4); ++ } + ffio_free_dyn_buf(&dyn_cp); + ret = ff_alloc_extradata(par, side_data_size); + if (ret < 0) +diff --git a/libavformat/movenc.c b/libavformat/movenc.c +index 2cd5773dc5..0cbbc094de 100644 +--- a/libavformat/movenc.c ++++ b/libavformat/movenc.c +@@ -5926,6 +5926,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt) + if (trk->par->codec_id == AV_CODEC_ID_MP4ALS || + trk->par->codec_id == AV_CODEC_ID_AAC || + trk->par->codec_id == AV_CODEC_ID_AV1 || ++ trk->par->codec_id == AV_CODEC_ID_H264 || + trk->par->codec_id == AV_CODEC_ID_FLAC) { 
+ buffer_size_t side_size; + uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); +diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c +index 38e4c65c4e..5e04c1df08 100644 +--- a/libavformat/rtpenc.c ++++ b/libavformat/rtpenc.c +@@ -19,6 +19,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "avc.h" + #include "avformat.h" + #include "mpegts.h" + #include "internal.h" +@@ -582,8 +583,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt) + ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0); + break; + case AV_CODEC_ID_H264: ++ { ++ uint8_t *side_data; ++ int side_data_size = 0; ++ ++ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &side_data_size); ++ ++ if (side_data_size != 0) { ++ int ps_size = side_data_size; ++ uint8_t * ps_buf = NULL; ++ ++ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size); ++ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size); ++ ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size); ++ av_free(ps_buf); ++ } + ff_rtp_send_h264_hevc(s1, pkt->data, size); + break; ++ } + case AV_CODEC_ID_H261: + ff_rtp_send_h261(s1, pkt->data, size); + break; +diff --git a/libavformat/utils.c b/libavformat/utils.c +index 75e5350a27..e10b493dae 100644 +--- a/libavformat/utils.c ++++ b/libavformat/utils.c +@@ -3013,6 +3013,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) + return 1; + } + ++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER ++// This should be quite general purpose but avoid possible conflicts ++// by limiting usage to cases where we know it works. ++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) ++{ ++ // Only try fallback if we know it is supported (HEVC only) ++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ?
NULL : ++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); ++ int err; ++ ++ // Failed to find fallback or we are already at the fallback ++ if (new_codec == NULL || new_codec == old_codec) ++ { ++ return AVERROR_DECODER_NOT_FOUND; ++ } ++ ++ // * This may be dodgy - header says to not use this fn, ++ // especially if we are going to reopen the context... ++ // (but it does seem to work for our cases) ++ if (avcodec_is_open(avctx)) { ++ avcodec_close(avctx); ++ } ++ ++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) ++ { ++ return err; ++ } ++ ++ return 0; ++} ++#else ++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) ++#endif ++ + /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ + static int try_decode_frame(AVFormatContext *s, AVStream *st, + const AVPacket *avpkt, AVDictionary **options) +@@ -3051,7 +3085,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, + av_dict_set(options ? options : &thread_opt, "lowres", "0", 0); + if (s->codec_whitelist) + av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); +- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); ++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) ++ { ++ // Try fallback if it looks worth a try ++ ret = try_fallback_decoder(avctx, codec, options ? 
options : &thread_opt); ++ } + if (!options) + av_dict_free(&thread_opt); + if (ret < 0) { +@@ -3082,6 +3120,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || + avctx->codec_type == AVMEDIA_TYPE_AUDIO) { + ret = avcodec_send_packet(avctx, &pkt); ++ ++ // If we are going to want to fall back we should know here ++ if (ret == AVERROR_DECODER_NOT_FOUND) { ++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) ++ break; ++ continue; ++ } ++ + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) + break; + if (ret >= 0) +@@ -3710,9 +3756,20 @@ FF_ENABLE_DEPRECATION_WARNINGS + // Try to just open decoders, in case this is enough to get parameters. + if (!has_codec_parameters(st, NULL) && st->internal->request_probe <= 0) { + if (codec && !avctx->codec) +- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) +- av_log(ic, AV_LOG_WARNING, +- "Failed to open codec in %s\n",__FUNCTION__); ++ { ++ int err; ++ ++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) ++ { ++ if (err == AVERROR_DECODER_NOT_FOUND) { ++ err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); ++ } ++ if (err < 0) { ++ av_log(ic, AV_LOG_WARNING, ++ "Failed to open codec in %s\n",__FUNCTION__); ++ } ++ } ++ } + } + if (!options) + av_dict_free(&thread_opt); +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 27bafe9e12..c9075ddf8a 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -68,6 +68,7 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -87,6 +88,7 @@ HEADERS = adler32.h \ + film_grain_params.h \ + + HEADERS-$(CONFIG_LZO) += lzo.h ++HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h + + ARCH_HEADERS = bswap.h \ + intmath.h \ +@@ -182,6 +184,7 @@ OBJS-$(CONFIG_LZO) += lzo.o + OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o + OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o + OBJS-$(CONFIG_QSV) += hwcontext_qsv.o ++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o + OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o + OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o + OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o +diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile +index 5613813ba8..ab8bcfcf34 100644 +--- a/libavutil/aarch64/Makefile ++++ b/libavutil/aarch64/Makefile +@@ -1,4 +1,6 @@ + OBJS += aarch64/cpu.o \ + aarch64/float_dsp_init.o \ + +-NEON-OBJS += aarch64/float_dsp_neon.o ++NEON-OBJS += aarch64/float_dsp_neon.o \ ++ aarch64/rpi_sand_neon.o \ ++ +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +new file mode 100644 +index 0000000000..11658de0c8 +--- /dev/null ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -0,0 +1,672 @@ ++/* ++Copyright (c) 2021 Michael Eiler ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: Michael Eiler ++*/ ++ ++#include "asm.S" ++ ++// void ff_rpi_sand8_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++ ++function ff_rpi_sand8_lines_to_planar_y8, export=1 ++ // w15 contains the number of rows we need to process ++ ldr w15, [sp, #0] ++ ++ // w8 will contain the number of blocks per row ++ // w8 = floor(_w/stride1) ++ // stride1 is assumed to always be 128 ++ mov w8, w1 ++ lsr w8, w8, #7 ++ ++ // in case the width of the image is not a multiple of 128, there will ++ // be an incomplete block at the end of every row ++ // w9 contains the number of pixels stored within this block ++ // w9 = _w - w8 * 128 ++ lsl w9, w8, #7 ++ sub w9, w7, w9 ++ ++ // this is the value we have to add to the src pointer after reading a complete block ++ // it will move the address to the start of the next block ++ // w10 = stride2 * stride1 - stride1 ++ mov w10, w4 ++ lsl w10, w10, #7 ++ sub w10, w10, #128 ++ ++ // w11 is the row offset, meaning the start offset of the first block of every column ++ // this will be increased with stride1 within every iteration of the row_loop ++ eor w11, w11, w11 ++ ++ // w12 = 0, processed row count ++ eor w12, w12, w12 ++row_loop: ++ // start of the first block within the current row ++ // x13 = row offset + src ++ mov x13, x2 ++ add x13, x13, x11 ++ ++ // w14 = 0, processed block count ++ eor w14, w14, w14 ++ ++ cmp w8, #0 ++ beq no_main_y8 ++ ++block_loop: ++ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 ++ // fortunately these aren't callee saved ones, meaning we don't need to backup them ++ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 ++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 ++ ++ // write 
these registers back to the destination vector and increase the dst address by 128 ++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 ++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 ++ ++ // move the source register to the beginning of the next block (x13 = src + block offset) ++ add x13, x13, x10 ++ // increase the block counter ++ add w14, w14, #1 ++ ++ // continue with the block_loop if we haven't copied all full blocks yet ++ cmp w8, w14 ++ bgt block_loop ++ ++ // handle the last block at the end of each row ++ // at most 127 byte values copied from src to dst ++no_main_y8: ++ eor w5, w5, w5 // i = 0 ++incomplete_block_loop_y8: ++ cmp w5, w9 ++ bge incomplete_block_loop_end_y8 ++ ++ ldrb w6, [x13] ++ strb w6, [x0] ++ add x13, x13, #1 ++ add x0, x0, #1 ++ ++ add w5, w5, #1 ++ b incomplete_block_loop_y8 ++incomplete_block_loop_end_y8: ++ ++ ++ // increase the row offset by 128 (stride1) ++ add w11, w11, #128 ++ // increment the row counter ++ add w12, w12, #1 ++ ++ // process the next row if we haven't finished yet ++ cmp w15, w12 ++ bgt row_loop ++ ++ ret ++endfunc ++ ++ ++ ++// void ff_rpi_sand8_lines_to_planar_c8( ++// uint8_t * dst_u, : x0 ++// unsigned int dst_stride_u, : w1 == width ++// uint8_t * dst_v, : x2 ++// unsigned int dst_stride_v, : w3 == width ++// const uint8_t * src, : x4 ++// unsigned int stride1, : w5 == 128 ++// unsigned int stride2, : w6 ++// unsigned int _x, : w7 ++// unsigned int y, : [sp, #0] ++// unsigned int _w, : [sp, #8] ++// unsigned int h); : [sp, #16] ++ ++function ff_rpi_sand8_lines_to_planar_c8, export=1 ++ // w7 = width ++ ldr w7, [sp, #8] ++ ++ // w15 contains the number of rows we need to process ++ // counts down ++ ldr w15, [sp, #16] ++ ++ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 ++ mov w8, w7 ++ lsr w8, w8, #6 ++ ++ // number of pixels in block at the end of every row ++ // w9 = _w - (w8 * 64) ++ lsl w9, w8, #6 ++ sub w9, w7, w9 ++ ++ // Skip at the end of the line to account for 
stride ++ sub w12, w1, w7 ++ ++ // address delta to the beginning of the next block ++ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 ++ lsl w10, w6, #7 ++ sub w10, w10, #128 ++ ++ // w11 = row address start offset = 0 ++ eor w11, w11, w11 ++ ++row_loop_c8: ++ // start of the first block within the current row ++ // x13 = row offset + src ++ mov x13, x4 ++ add x13, x13, x11 ++ ++ // w14 = 0, processed block count ++ eor w14, w14, w14 ++ ++ cmp w8, #0 ++ beq no_main_c8 ++ ++block_loop_c8: ++ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values ++ ld2 { v0.16b, v1.16b }, [x13], #32 ++ ld2 { v2.16b, v3.16b }, [x13], #32 ++ ld2 { v4.16b, v5.16b }, [x13], #32 ++ ld2 { v6.16b, v7.16b }, [x13], #32 ++ ++ // swap register so that we can write them out with a single instruction ++ mov v16.16b, v1.16b ++ mov v17.16b, v3.16b ++ mov v18.16b, v5.16b ++ mov v1.16b, v2.16b ++ mov v2.16b, v4.16b ++ mov v3.16b, v6.16b ++ mov v4.16b, v16.16b ++ mov v5.16b, v17.16b ++ mov v6.16b, v18.16b ++ ++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 ++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 ++ ++ // increment row counter and move src to the beginning of the next block ++ add w14, w14, #1 ++ add x13, x13, x10 ++ ++ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks ++ cmp w8, w14 ++ bgt block_loop_c8 ++ ++no_main_c8: ++ // handle incomplete block at the end of every row ++ eor w5, w5, w5 // point counter, this might be ++incomplete_block_loop_c8: ++ cmp w5, w9 ++ bge incomplete_block_loop_end_c8 ++ ++ ldrb w1, [x13] ++ strb w1, [x0] ++ add x13, x13, #1 ++ ++ ldrb w1, [x13] ++ strb w1, [x2] ++ add x13, x13, #1 ++ ++ add x0, x0, #1 ++ add x2, x2, #1 ++ ++ add w5, w5, #1 ++ b incomplete_block_loop_c8 ++incomplete_block_loop_end_c8: ++ ++ // increase row_offset by stride1 ++ add w11, w11, #128 ++ add x0, x0, w12, sxtw ++ add x2, x2, w12, sxtw ++ ++ // jump to row_Loop_c8 iff the row count is small than 
the height ++ subs w15, w15, #1 ++ bgt row_loop_c8 ++ ++ ret ++endfunc ++ ++// Unzip chroma ++// ++// On entry: ++// a0 = V0, U2, ... ++// a1 = U0, V1, ... ++// a2 = U1, V2, ... ++// b0 = V8, U10, ... ++// b1 = U8, V9, ... ++// b2 = U9, V10, ... ++// ++// On exit: ++// d0 = U0, U3, ... ++// ... ++// a0 = V0, V3, .. ++// ... ++// ++// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs) ++ ++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2 ++ uzp1 \d0\().8h, \a1\().8h, \b1\().8h ++ uzp1 \d1\().8h, \a2\().8h, \b2\().8h ++ uzp2 \d2\().8h, \a0\().8h, \b0\().8h ++ ++ uzp1 \a0\().8h, \a0\().8h, \b0\().8h ++ uzp2 \a1\().8h, \a1\().8h, \b1\().8h ++ uzp2 \a2\().8h, \a2\().8h, \b2\().8h ++.endm ++ ++// SAND30 -> 10bit ++.macro USAND10 d0, d1, d2, a0, a1 ++ shrn \d2\().4h, \a0\().4s, #14 ++ shrn \d1\().4h, \a0\().4s, #10 ++ ++ shrn2 \d2\().8h, \a1\().4s, #14 ++ shrn2 \d1\().8h, \a1\().4s, #10 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ ++ ushr \d2\().8h, \d2\().8h, #6 ++ bic \d0\().8h, #0xfc, lsl #8 ++ bic \d1\().8h, #0xfc, lsl #8 ++.endm ++ ++// SAND30 -> 8bit ++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2 ++ shrn \d1\().4h, \a0\().4s, #12 ++ shrn2 \d1\().8h, \a1\().4s, #12 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ uzp2 \d2\().8h, \a0\().8h, \a1\().8h ++ ++ shrn \t1\().4h, \a2\().4s, #12 ++ shrn2 \t1\().8h, \a3\().4s, #12 ++ uzp1 \t0\().8h, \a2\().8h, \a3\().8h ++ uzp2 \t2\().8h, \a2\().8h, \a3\().8h ++ ++ shrn \d0\().8b, \d0\().8h, #2 ++ shrn2 \d0\().16b, \t0\().8h, #2 ++ shrn \d2\().8b, \d2\().8h, #6 ++ shrn2 \d2\().16b, \t2\().8h, #6 ++ uzp1 \d1\().16b, \d1\().16b, \t1\().16b ++.endm ++ ++ ++// void ff_rpi_sand30_lines_to_planar_c16( ++// uint8_t * dst_u, // [x0] ++// unsigned int dst_stride_u, // [w1] ++// uint8_t * dst_v, // [x2] ++// unsigned int dst_stride_v, // [w3] ++// const uint8_t * src, // [x4] ++// unsigned int stride1, // [w5] 128 ++// unsigned int stride2, // [w6] ++// unsigned int _x, // [w7] 0 ++// unsigned int y, // [sp, 
#0] ++// unsigned int _w, // [sp, #8] w9 ++// unsigned int h); // [sp, #16] w10 ++ ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ ldr w7, [sp, #0] // y ++ ldr w8, [sp, #8] // _w ++ ldr w10, [sp, #16] // h ++ lsl w6, w6, #7 // Fixup stride2 ++ sub w6, w6, #64 ++ uxtw x6, w6 ++ sub w1, w1, w8, LSL #1 // Fixup chroma strides ++ sub w3, w3, w8, LSL #1 ++ lsl w7, w7, #7 // Add y to src ++ add x4, x4, w7, UXTW ++10: ++ mov w13, #0 ++ mov x5, x4 ++ mov w9, w8 ++1: ++ ld1 {v0.4s-v3.4s}, [x5], #64 ++ ld1 {v4.4s-v7.4s}, [x5], x6 ++ subs w9, w9, #48 ++ ++ USAND10 v17, v16, v18, v0, v1 ++ USAND10 v20, v19, v21, v2, v3 ++ UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21 ++ USAND10 v23, v22, v24, v4, v5 ++ USAND10 v26, v25, v27, v6, v7 ++ UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 ++ ++ blt 2f ++ ++ st3 {v0.8h-v2.8h}, [x0], #48 ++ st3 {v4.8h-v6.8h}, [x0], #48 ++ st3 {v16.8h-v18.8h}, [x2], #48 ++ st3 {v22.8h-v24.8h}, [x2], #48 ++ ++ bne 1b ++11: ++ subs w10, w10, #1 ++ add x4, x4, #128 ++ add x0, x0, w1, UXTW ++ add x2, x2, w3, UXTW ++ bne 10b ++99: ++ ret ++ ++// Partial final write ++2: ++ cmp w9, #24-48 ++ blt 1f ++ st3 {v0.8h - v2.8h}, [x0], #48 ++ st3 {v16.8h - v18.8h}, [x2], #48 ++ beq 11b ++ mov v0.16b, v4.16b ++ mov v1.16b, v5.16b ++ sub w9, w9, #24 ++ mov v2.16b, v6.16b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ mov v18.16b, v24.16b ++1: ++ cmp w9, #12-48 ++ blt 1f ++ st3 {v0.4h - v2.4h}, [x0], #24 ++ st3 {v16.4h - v18.4h}, [x2], #24 ++ beq 11b ++ mov v0.2d[0], v0.2d[1] ++ sub w9, w9, #12 ++ mov v1.2d[0], v1.2d[1] ++ mov v2.2d[0], v2.2d[1] ++ mov v16.2d[0], v16.2d[1] ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w9, #6-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v0.h - v2.h}[1], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ st3 {v16.h - v18.h}[1], [x2], #6 ++ beq 11b ++ mov v0.s[0], v0.s[1] ++ sub w9, w9, #6 ++ mov v1.s[0], v1.s[1] ++ mov v2.s[0], v2.s[1] ++ mov v16.s[0], v16.s[1] ++ mov v17.s[0], v17.s[1] ++ mov 
v18.s[0], v18.s[1] ++1: ++ cmp w9, #3-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ beq 11b ++ mov v0.h[0], v0.h[1] ++ sub w9, w9, #3 ++ mov v1.h[0], v1.h[1] ++ mov v16.h[0], v16.h[1] ++ mov v17.h[0], v17.h[1] ++1: ++ cmp w9, #2-48 ++ blt 1f ++ st2 {v0.h - v1.h}[0], [x0], #4 ++ st2 {v16.h - v17.h}[0], [x2], #4 ++ b 11b ++1: ++ st1 {v0.h}[0], [x0], #2 ++ st1 {v16.h}[0], [x2], #2 ++ b 11b ++endfunc ++ ++ ++//void ff_rpi_sand30_lines_to_planar_p010( ++// uint8_t * dest, ++// unsigned int dst_stride, ++// const uint8_t * src, ++// unsigned int src_stride1, ++// unsigned int src_stride2, ++// unsigned int _x, ++// unsigned int y, ++// unsigned int _w, ++// unsigned int h); ++ ++// void ff_rpi_sand30_lines_to_planar_y16( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ uxtw x4, w4 ++ sub w1, w1, w7, lsl #1 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ USAND10 v16, v17, v18, v0, v1 ++ USAND10 v19, v20, v21, v2, v3 ++ USAND10 v22, v23, v24, v4, v5 ++ USAND10 v25, v26, v27, v6, v7 ++ ++ blt 2f ++ ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 ++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++ mov v19.16b, v25.16b ++ mov v20.16b, v26.16b ++ mov v21.16b, v27.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b ++ mov v17.16b, v20.16b ++ sub w5, w5, #24 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #12 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #6 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #3 ++ mov v17.4h[0], v17.4h[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.h, v17.h}[0], [x0], #4 ++ b 11b ++1: ++ st1 
{v16.h}[0], [x0], #2 ++ b 11b ++ ++endfunc ++ ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ uxtw x4, w4 ++ sub w1, w1, w7 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24 ++ USAND8 v19, v20, v21, v4, v5, v6, v7, v22, v23, v24 ++ ++ blt 2f ++ ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b // second 48 pixels are in v19-v21 (v22-v24 are only USAND8 scratch) ++ mov v17.16b, v20.16b ++ sub w5, w5, #48 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #24 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #12 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.b, v17.b, 
v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #6 ++ mov v17.4h[0], v17.4h[1] ++ mov v18.4h[0], v18.4h[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ beq 11b ++ mov v16.8b[0], v16.8b[1] ++ sub w5, w5, #3 ++ mov v17.8b[0], v17.8b[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.b, v17.b}[0], [x0], #2 ++ b 11b ++1: ++ st1 {v16.b}[0], [x0], #1 ++ b 11b ++ ++endfunc ++ +diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h +new file mode 100644 +index 0000000000..2a56135bc3 +--- /dev/null ++++ b/libavutil/aarch64/rpi_sand_neon.h +@@ -0,0 +1,59 @@ ++/* ++Copyright (c) 2021 Michael Eiler ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: Michael Eiler ++*/ ++ ++#pragma once ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, ++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, ++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, ++ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++#ifdef __cplusplus ++} ++#endif ++ +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 
+--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..60e697f681 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,925 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#include "libavutil/arm/asm.S" ++ ++ ++@ General notes: ++@ Having done some timing on this in sand8->y8 (Pi4) ++@ vst1 (680fps) is a bit faster than vstm (660fps) ++@ vldm (680fps) is noticeably faster than vld1 (480fps) ++@ (or it might be that a mix is what is required) ++@ ++@ At least on a Pi4 it is no more expensive to have a single auto-inc register ++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted ++@ the latter was better) ++@ ++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless ++@ the memory is uncached. ++@ As these are Sand -> planar we can assume that src is going to be aligned but ++@ it is possible that dest isn't (converting to .yuv or other packed format). ++@ Luckily vst1 is faster than vstm :-) so all is well ++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 ++@ .8 stores would let us do non-word aligned stores into uncached but it ++@ probably isn't worth it. 
++ ++ ++ ++ ++@ void ff_rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, // [r0] ++@ const uint8_t * src1, // [r1] ++@ const uint8_t * src2, // [r2] ++@ unsigned int lines); // [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function ff_rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 L ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ lsl r3, #7 ++ sub r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2, {q8-q15} ++ add r2, r3 ++ subs r5, #128 ++ blt 2f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ vst1.8 {d28, d29, d30, d31}, [r0]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #64-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ beq 11b ++ vmov q8, q12 ++ vmov q9, q13 ++ sub r5, #64 ++ vmov q10, q14 ++ vmov q11, q15 ++1: ++ cmp r5, #32-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ beq 11b ++ vmov q8, q10 ++ sub r5, #32 ++ vmov q9, q11 ++1: ++ cmp r5, #16-128 ++ blt 1f ++ vst1.8 {d16, d17}, [r0]! ++ beq 11b ++ sub r5, #16 ++ vmov q8, q9 ++1: ++ cmp r5, #8-128 ++ blt 1f ++ vst1.8 {d16}, [r0]! ++ beq 11b ++ sub r5, #8 ++ vmov d16, d17 ++1: ++ cmp r5, #4-128 ++ blt 1f ++ vst1.32 {d16[0]}, [r0]! ++ beq 11b ++ sub r5, #4 ++ vshr.u64 d16, #32 ++1: ++ cmp r5, #2-128 ++ blt 1f ++ vst1.16 {d16[0]}, [r0]! ++ beq 11b ++ vst1.8 {d16[2]}, [r0]! ++ b 11b ++1: ++ vst1.8 {d16[0]}, [r0]! 
++ b 11b ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_c8( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r12, r6 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_c8, export=1 ++ push {r4-r8, lr} @ +24 ++ ++ ldr r5, [sp, #24] ++ ldr r8, [sp, #32] ++ ldr r7, [sp, #40] ++ ldr r6, [sp, #44] ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r6 ++ sub r3, r3, r6 ++ ldr r7, [sp, #48] ++ vpush {q4-q7} ++ ++10: ++ mov r4, r5 ++ mov r12, r6 ++1: ++ subs r12, #64 ++ vldm r4, {q0-q7} ++ add r4, r8 ++ it gt ++ vldmgt r4, {q8-q15} ++ add r4, r8 ++ ++ vuzp.8 q0, q1 ++ vuzp.8 q2, q3 ++ vuzp.8 q4, q5 ++ vuzp.8 q6, q7 ++ ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vuzp.8 q12, q13 ++ vuzp.8 q14, q15 ++ subs r12, #64 ++ ++ @ Rearrange regs so we can use vst1 with 4 regs ++ vswp q1, q2 ++ vswp q5, q6 ++ vswp q9, q10 ++ vswp q13, q14 ++ blt 2f ++ ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! ++ vst1.8 {d20, d21, d22, d23}, [r2]! ++ vst1.8 {d28, d29, d30, d31}, [r2]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ vpop {q4-q7} ++ pop {r4-r8,pc} ++ ++2: ++ cmp r12, #64-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! 
++ beq 11b ++ sub r12, #64 ++ vmov q0, q8 ++ vmov q1, q9 ++ vmov q2, q10 ++ vmov q3, q11 ++ vmov q4, q12 ++ vmov q5, q13 ++ vmov q6, q14 ++ vmov q7, q15 ++1: ++ cmp r12, #32-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ beq 11b ++ sub r12, #32 ++ vmov q0, q4 ++ vmov q1, q5 ++ vmov q2, q6 ++ vmov q3, q7 ++1: ++ cmp r12, #16-128 ++ blt 1f ++ vst1.8 {d0, d1 }, [r0]! ++ vst1.8 {d4, d5 }, [r2]! ++ beq 11b ++ sub r12, #16 ++ vmov q0, q1 ++ vmov q2, q3 ++1: ++ cmp r12, #8-128 ++ blt 1f ++ vst1.8 {d0}, [r0]! ++ vst1.8 {d4}, [r2]! ++ beq 11b ++ sub r12, #8 ++ vmov d0, d1 ++ vmov d4, d5 ++1: ++ cmp r12, #4-128 ++ blt 1f ++ vst1.32 {d0[0]}, [r0]! ++ vst1.32 {d4[0]}, [r2]! ++ beq 11b ++ sub r12, #4 ++ vmov s0, s1 ++ vmov s8, s9 ++1: ++ cmp r12, #2-128 ++ blt 1f ++ vst1.16 {d0[0]}, [r0]! ++ vst1.16 {d4[0]}, [r2]! ++ beq 11b ++ vst1.8 {d0[2]}, [r0]! ++ vst1.8 {d4[2]}, [r2]! ++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ vst1.8 {d4[0]}, [r2]! ++ b 11b ++endfunc ++ ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_y16( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! ++ ands lr, #127 ++ vshrn.u32 d2, q10, #10 ++ vmovn.u32 d0, q10 ++ ++ vshrn.u32 d5, q11, #14 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d3, q11, #10 ++ vmovn.u32 d1, q11 ++ ++ subs r5, #48 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 ++ ++ vshrn.u32 d20, q12, #14 ++ vshrn.u32 d18, q12, #10 ++ vmovn.u32 d16, q12 ++ ++ vshrn.u32 d21, q13, #14 ++ vshrn.u32 d19, q13, #10 ++ vmovn.u32 d17, q13 ++ ++ vshr.u16 q10, #6 ++ vbic.u16 q8, #0xfc00 ++ vbic.u16 q9 , #0xfc00 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_c16( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r6, r9 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ push {r4-r10, lr} @ +32 ++ ldr r5, [sp, #32] ++ ldr r8, [sp, #40] ++ ldr r7, [sp, #48] ++ ldr r9, [sp, #52] ++ mov r12, #48 ++ sub r8, #1 ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r9, lsl #1 ++ sub r3, r3, r9, lsl #1 ++ ldr r7, [sp, #56] ++10: ++ mov lr, #0 ++ mov r4, r5 ++ mov r6, r9 ++1: ++ vldm r4!, {q0-q3} ++ add lr, #64 ++ ++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 ++ vshrn.u32 d20, q0, #14 ++ vmovn.u32 d18, q0 ++ vshrn.u32 d0, q0, #10 ++ ands lr, #127 ++ ++ vshrn.u32 d21, q1, #14 ++ vmovn.u32 d19, q1 ++ vshrn.u32 d1, q1, #10 ++ ++ vshrn.u32 d22, q2, #10 ++ vmovn.u32 d2, q2 ++ vshrn.u32 d4, q2, #14 ++ ++ add r10, r0, #24 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d3, q3 ++ vshrn.u32 d5, q3, #14 ++ ++ it eq ++ addeq r4, r8 ++ vuzp.16 q0, q11 ++ vuzp.16 q9, q1 ++ vuzp.16 q10, q2 ++ ++ @ q0 V0, V3,.. ++ @ q9 U0, U3... ++ @ q10 U1, U4... ++ @ q11 U2, U5,.. ++ @ q1 V1, V4, ++ @ q2 V2, V5,.. 
++ ++ subs r6, #24 ++ vbic.u16 q11, #0xfc00 ++ vbic.u16 q9, #0xfc00 ++ vshr.u16 q10, #6 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 ++ ++ blt 2f ++ ++ vst3.16 {d18, d20, d22}, [r0], r12 ++ vst3.16 {d19, d21, d23}, [r10] ++ add r10, r2, #24 ++ vst3.16 {d0, d2, d4}, [r2], r12 ++ vst3.16 {d1, d3, d5}, [r10] ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ ++ pop {r4-r10, pc} ++ ++@ Partial final write ++2: ++ cmp r6, #-12 ++ blt 1f ++ vst3.16 {d18, d20, d22}, [r0]! ++ vst3.16 {d0, d2, d4}, [r2]! ++ beq 11b ++ vmov d18, d19 ++ vmov d20, d21 ++ vmov d22, d23 ++ sub r6, #12 ++ vmov d0, d1 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r6, #-18 ++ @ Rezip here as it makes the remaining tail handling easier ++ vzip.16 d0, d18 ++ vzip.16 d2, d20 ++ vzip.16 d4, d22 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]! ++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]! ++ beq 11b ++ vmov d0, d18 ++ vmov d2, d20 ++ sub r6, #6 ++ vmov d4, d22 ++1: ++ cmp r6, #-21 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ beq 11b ++ vmov s4, s5 ++ sub r6, #3 ++ vmov s0, s1 ++1: ++ cmp r6, #-22 ++ blt 1f ++ vst2.16 {d0[1], d2[1]}, [r0]! ++ vst2.16 {d0[0], d2[0]}, [r2]! ++ b 11b ++1: ++ vst1.16 {d0[1]}, [r0]! ++ vst1.16 {d0[0]}, [r2]! ++ b 11b ++ ++endfunc ++ ++@ void ff_rpi_sand30_lines_to_planar_p010( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_p010, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ vmov.u16 q15, #0xffc0 ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshl.u32 q14, q10, #6 ++ ands lr, #127 ++ vshrn.u32 d4, q10, #14 ++ vshrn.u32 d2, q10, #4 ++ vmovn.u32 d0, q14 ++ ++ vshl.u32 q14, q11, #6 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d5, q11, #14 ++ vshrn.u32 d3, q11, #4 ++ vmovn.u32 d1, q14 ++ ++ subs r5, #48 ++ vand q2, q15 ++ vand q1, q15 ++ vand q0, q15 ++ ++ vshl.u32 q14, q12, #6 ++ vshrn.u32 d20, q12, #14 ++ vshrn.u32 d18, q12, #4 ++ vmovn.u32 d16, q14 ++ ++ vshl.u32 q14, q13, #6 ++ vshrn.u32 d21, q13, #14 ++ vshrn.u32 d19, q13, #4 ++ vmovn.u32 d17, q14 ++ ++ vand q10, q15 ++ vand q9, q15 ++ vand q8, q15 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ lsl r3, #7 ++ sub r1, r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++1: ++ vldm r2, {q8-q15} ++ ++ subs r5, #96 ++ ++ vmovn.u32 d0, q8 ++ vshrn.u32 d2, q8, #12 ++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
++ ++ add r2, r3 ++ ++ vmovn.u32 d1, q9 ++ vshrn.u32 d3, q9, #12 ++ vshrn.u32 d5, q9, #16 ++ ++ pld [r2, #0] ++ ++ vshrn.u16 d0, q0, #2 ++ vmovn.u16 d1, q1 ++ vshrn.u16 d2, q2, #6 ++ ++ vmovn.u32 d16, q10 ++ vshrn.u32 d18, q10, #12 ++ vshrn.u32 d20, q10, #16 ++ ++ vmovn.u32 d17, q11 ++ vshrn.u32 d19, q11, #12 ++ vshrn.u32 d21, q11, #16 ++ ++ pld [r2, #64] ++ ++ vshrn.u16 d4, q8, #2 ++ vmovn.u16 d5, q9 ++ vshrn.u16 d6, q10, #6 ++ ++ vmovn.u32 d16, q12 ++ vshrn.u32 d18, q12, #12 ++ vshrn.u32 d20, q12, #16 ++ ++ vmovn.u32 d17, q13 ++ vshrn.u32 d19, q13, #12 ++ vshrn.u32 d21, q13, #16 ++ ++ vshrn.u16 d16, q8, #2 ++ vmovn.u16 d17, q9 ++ vshrn.u16 d18, q10, #6 ++ ++ vmovn.u32 d20, q14 ++ vshrn.u32 d22, q14, #12 ++ vshrn.u32 d24, q14, #16 ++ ++ vmovn.u32 d21, q15 ++ vshrn.u32 d23, q15, #12 ++ vshrn.u32 d25, q15, #16 ++ ++ vshrn.u16 d20, q10, #2 ++ vmovn.u16 d21, q11 ++ vshrn.u16 d22, q12, #6 ++ ++ blt 2f ++ ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ vst3.8 {d16, d17, d18}, [r0], r12 ++ vst3.8 {d20, d21, d22}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #48-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ beq 11b ++ vmov q0, q8 ++ vmov q2, q10 ++ sub r5, #48 ++ vmov d2, d18 ++ vmov d6, d22 ++1: ++ cmp r5, #24-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0]! ++ beq 11b ++ vmov q0, q2 ++ sub r5, #24 ++ vmov d2, d6 ++1: ++ cmp r5, #12-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! ++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! ++ beq 11b ++ vmov s0, s1 ++ sub r5, #12 ++ vmov s2, s3 ++ vmov s4, s5 ++1: ++ cmp r5, #6-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! 
++ @ no extra advance here: the two post-indexed vst3.8 stores above already moved r0 by 6 bytes ++ beq 11b ++ vshr.u32 d0, #16 ++ sub r5, #6 ++ vshr.u32 d1, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #3-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #8 ++ vshr.u32 d1, #8 ++1: ++ cmp r5, #2-96 ++ blt 1f ++ vst2.8 {d0[0], d1[0]}, [r0]! ++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ +diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h +new file mode 100644 +index 0000000000..d457c10870 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.h +@@ -0,0 +1,110 @@ ++/* ++Copyright (c) 2020 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_ARM_SAND_NEON_H ++#define AVUTIL_ARM_SAND_NEON_H ++ ++void ff_rpi_sand128b_stripe_to_8_10( ++ uint8_t * dest, // [r0] ++ const uint8_t * src1, // [r1] ++ const uint8_t * src2, // [r2] ++ unsigned int lines); // [r3] ++ ++void ff_rpi_sand8_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand8_lines_to_planar_c8( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] -> r12, r6 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_y16( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, 
#8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_c16( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] -> r6, r9 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_p010( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++#endif // AVUTIL_ARM_SAND_NEON_H ++ +diff --git a/libavutil/frame.c b/libavutil/frame.c +index 75e347bf2f..daa6477485 100644 +--- a/libavutil/frame.c ++++ b/libavutil/frame.c +@@ -16,6 +16,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + #include "channel_layout.h" + #include "avassert.h" + #include "buffer.h" +@@ -26,6 +28,9 @@ + #include "mem.h" + #include "samplefmt.h" + #include "hwcontext.h" ++#if CONFIG_SAND ++#include "rpi_sand_fns.h" ++#endif + + #if FF_API_FRAME_GET_SET + 
MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) +@@ -903,6 +908,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) + (frame->crop_top + frame->crop_bottom) >= frame->height) + return AVERROR(ERANGE); + ++#if CONFIG_SAND ++ // Sand cannot be cropped - do not try ++ if (av_rpi_is_sand_format(frame->format)) ++ return 0; ++#endif ++ + desc = av_pix_fmt_desc_get(frame->format); + if (!desc) + return AVERROR_BUG; +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 7d1f8e2935..a4e7dc915d 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -990,6 +990,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); + */ + const char *av_frame_side_data_name(enum AVFrameSideDataType type); + ++ ++static inline int av_frame_cropped_width(const AVFrame * const frame) ++{ ++ return frame->width - (frame->crop_left + frame->crop_right); ++} ++static inline int av_frame_cropped_height(const AVFrame * const frame) ++{ ++ return frame->height - (frame->crop_top + frame->crop_bottom); ++} ++ + /** + * @} + */ +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index 7a9fdbd263..2f825b7e16 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + /* This was introduced in version 4.6. And may not exist all without an + * optional package. 
So to prevent a hard dependency on needing the Linux +@@ -31,6 +32,7 @@ + #endif + + #include ++#include + #include + + #include "avassert.h" +@@ -38,7 +40,9 @@ + #include "hwcontext_drm.h" + #include "hwcontext_internal.h" + #include "imgutils.h" +- ++#if CONFIG_SAND ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static void drm_device_free(AVHWDeviceContext *hwdev) + { +@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, + AVDRMDeviceContext *hwctx = hwdev->hwctx; + drmVersionPtr version; + ++ if (device == NULL) { ++ hwctx->fd = -1; ++ return 0; ++ } ++ + hwctx->fd = open(device, O_RDWR); + if (hwctx->fd < 0) + return AVERROR(errno); +@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + if (flags & AV_HWFRAME_MAP_WRITE) + mmap_prot |= PROT_WRITE; + ++ if (dst->format == AV_PIX_FMT_NONE) ++ dst->format = hwfc->sw_format; + #if HAVE_LINUX_DMA_BUF_H + if (flags & AV_HWFRAME_MAP_READ) + map->sync_flags |= DMA_BUF_SYNC_READ; +@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + + dst->width = src->width; + dst->height = src->height; ++ dst->crop_top = src->crop_top; ++ dst->crop_bottom = src->crop_bottom; ++ dst->crop_left = src->crop_left; ++ dst->crop_right = src->crop_right; ++ ++#if CONFIG_SAND ++ // Rework for sand frames ++ if (av_rpi_is_sand_frame(dst)) { ++ // As it stands the sand formats hold stride2 in linesize[3] ++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do ++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] ++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); ++ dst->linesize[0] = 128; ++ dst->linesize[1] = 128; ++ // *** Are we sure src->height is actually what we want ??? 
++ } ++#endif + + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, + &drm_unmap_frame, map); +@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) + { +- enum AVPixelFormat *pix_fmts; ++ enum AVPixelFormat *p; + +- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); +- if (!pix_fmts) ++ p = *formats = av_malloc_array(3, sizeof(*p)); ++ if (!p) + return AVERROR(ENOMEM); + +- pix_fmts[0] = ctx->sw_format; +- pix_fmts[1] = AV_PIX_FMT_NONE; ++ // **** Offer native sand too ???? ++ *p++ = ++#if CONFIG_SAND ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? ++ AV_PIX_FMT_YUV420P : ++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? ++ AV_PIX_FMT_YUV420P10LE : ++#endif ++ ctx->sw_format; ++ ++#if CONFIG_SAND ++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) ++ *p++ = AV_PIX_FMT_NV12; ++#endif + +- *formats = pix_fmts; ++ *p = AV_PIX_FMT_NONE; + return 0; + } + +@@ -231,18 +272,63 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, + map = av_frame_alloc(); + if (!map) + return AVERROR(ENOMEM); +- map->format = dst->format; + ++ // Map to default ++ map->format = AV_PIX_FMT_NONE; + err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); + if (err) + goto fail; + +- map->width = dst->width; +- map->height = dst->height; ++#if 0 ++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, ++ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, ++ map->width, map->height, ++ map->linesize[0], ++ map->linesize[1], ++ map->linesize[2], ++ map->linesize[3], ++ dst->width, dst->height, ++ dst->linesize[0], ++ dst->linesize[1], ++ dst->linesize[2]); ++#endif ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(map)) { ++ // Preserve crop - later ffmpeg code assumes that we have in that it ++ // 
overwrites any crop that we create with the old values ++ unsigned int stride2 = map->linesize[3]; ++ const unsigned int w = FFMIN(dst->width, map->width); ++ const unsigned int h = FFMIN(dst->height, map->height); ++ ++ map->crop_top = 0; ++ map->crop_bottom = 0; ++ map->crop_left = 0; ++ map->crop_right = 0; ++ ++ if (av_rpi_sand_to_planar_frame(dst, map) != 0) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); ++ err = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ dst->width = w; ++ dst->height = h; ++ } ++ else ++#endif ++ { ++ // Kludge mapped h/w s.t. frame_copy works ++ map->width = dst->width; ++ map->height = dst->height; ++ err = av_frame_copy(dst, map); ++ } + +- err = av_frame_copy(dst, map); + if (err) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); + goto fail; ++ } + + err = 0; + fail: +@@ -257,7 +343,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, + int err; + + if (src->width > hwfc->width || src->height > hwfc->height) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height); + return AVERROR(EINVAL); ++ } + + map = av_frame_alloc(); + if (!map) +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 18c7a0efc8..bab13a4d50 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2395,6 +2395,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .name = "vulkan", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0,
10, 3, 9, 1 }, /* U */ ++ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_16] = { ++ .name = "sand64_16", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */ ++ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */ ++ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_RPI4_8] = { ++ .name = "rpi4_8", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_10] = { ++ .name = "rpi4_10", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 46ef211add..9195ead15f 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -357,6 +357,14 @@ enum AVPixelFormat { + + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian + AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian ++// RPI - not on ifdef so can be got at by calling progs ++// #define so code that uses this can know it is there ++#define AVUTIL_HAVE_PIX_FMT_SAND 1 ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_RPI4_8, ++ AV_PIX_FMT_RPI4_10, + + AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined + AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..0d5d203dc3 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,227 @@ ++/* ++Copyright (c) 2018 Raspberry 
Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++#if PW == 1 && HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++#if PW == 1 && HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ 
} ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ 
} ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..0626bb06cb +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,447 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#include "config.h" ++#include <stdint.h> ++#include <string.h> ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++#include "frame.h" ++ ++#if ARCH_ARM && HAVE_NEON ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" ++#define HAVE_SAND_ASM 1 ++#elif ARCH_AARCH64 && HAVE_NEON ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" ++#define HAVE_SAND_ASM 1 ++#else ++#define HAVE_SAND_ASM 0 ++#endif ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if 1 ++// Simple round of n 16-bit samples to 8 bits, saturated at 255 (a plain uint8_t store would wrap a rounded-up 256 to 0) ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n, src++) { ++ *dst++ = ((*src + rnd) >> shr) > 255 ? 255 : ((*src + rnd) >> shr); ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int
slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { ++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * d = (uint16_t *)dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = p3 & 0x3ff; ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = p3 & 0x3ff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 10) & 0x3ff; ++ } ++ } ++} ++ ++ ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 8; ++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { ++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, 
stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * du = (uint16_t *)dst_u; ++ uint16_t * dv = (uint16_t *)dst_v; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ if (xskip0 == 1) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = (p3b >> 0) & 0x3ff; ++ } ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ if (xrem1 == 2) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ } ++ } ++ } ++} ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// simple truncation to 8 bits - the bottom 2 bits of each sample are dropped ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 &
~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { // NEON path only when the CPU reports NEON at runtime ++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint8_t * d = dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = (p3 >> 2) & 0xff; ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = (p3 >> 2) & 0xff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 12) & 0xff; ++ } ++ } ++} ++ ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if ARCH_ARM && HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j
* 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) ++{ ++ const int w = av_frame_cropped_width(src); ++ const int h = av_frame_cropped_height(src); ++ const int x = src->crop_left; ++ const int y = src->crop_top; ++ ++ // We will crop as part of the conversion ++ dst->crop_top = 0; ++ dst->crop_left = 0; ++ dst->crop_bottom = 0; ++ dst->crop_right = 0; ++ ++ switch (src->format){ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ 
av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x*2, y, w*2, h); ++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_RPI4_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ default: ++ return -1; ++ } ++ ++ return av_frame_copy_props(dst, src); ++} +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..462ccb8abd +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,188 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, 
unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++// dst must contain required pixel format & allocated data buffers ++// Cropping on the src buffer will be honoured and dst crop will be set to zero ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++#ifdef RPI_ZC_SAND128_ONLY ++ // If we are sure we only support 128 byte sand formats replace the ++ // var with a constant which should allow for better optimisation ++ return 128; ++#else ++ return frame->linesize[0]; ++#endif ++} ++ ++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_SAND128 ||
frame->format == AV_PIX_FMT_RPI4_8); ++} ++ ++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) ++{ ++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand8_frame(frame) ? 0 : 1; ++} ++ ++// If x is measured in bytes (not pixels) then this works for sand64_16 as ++// well as sand128 - but in the general case we work that out ++ ++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); ++} ++ ++#endif ++ +diff --git 
a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c +index a9bf6ff9e0..6a0e2dcc09 100644 +--- a/libswscale/aarch64/rgb2rgb.c ++++ b/libswscale/aarch64/rgb2rgb.c +@@ -30,6 +30,12 @@ + void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride); ++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + av_cold void rgb2rgb_init_aarch64(void) + { +@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void) + + if (have_neon(cpu_flags)) { + interleaveBytes = ff_interleave_bytes_neon; ++ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64; ++ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64; + } + } +diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S +index d81110ec57..476ca723a0 100644 +--- a/libswscale/aarch64/rgb2rgb_neon.S ++++ b/libswscale/aarch64/rgb2rgb_neon.S +@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1 + 0: + ret + endfunc ++ ++// void ff_rgb24toyv12_aarch64( ++// const uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++function ff_rgb24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld1 {v3.s}[2], [x15], #4 ++ ld1 {v3.s}[1], [x15], #4 ++ ld1 {v3.s}[0], [x15], #4 ++ ld1 {v4.s}[2], [x15], #4 ++ ld1 {v4.s}[1], [x15], #4 ++ ld1 {v4.s}[0], [x15], #4 ++ ld1 {v5.s}[2], [x15], #4 ++ ld1 {v5.s}[1], [x15], #4 ++ ld1 {v5.s}[0], [x15] ++ b 99f ++endfunc ++ ++// void ff_bgr24toyv12_aarch64( ++// const 
uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++// regs ++// v0-2 Src bytes - reused as chroma src ++// v3-5 Coeffs (packed very inefficiently - could be squashed) ++// v6 128b ++// v7 128h ++// v8-15 Reserved ++// v16-18 Lo Src expanded as H ++// v19 - ++// v20-22 Hi Src expanded as H ++// v23 - ++// v24 U out ++// v25 U tmp ++// v26 Y out ++// v27-29 Y tmp ++// v30 V out ++// v31 V tmp ++ ++// Assumes Little Endian in tail stores & conversion matrix ++ ++function ff_bgr24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[2], [x15] ++99: ++ ldr w14, [sp, #0] ++ movi v7.8b, #128 ++ uxtl v6.8h, v7.8b ++ // Ensure if nothing to do then we do nothing ++ cmp w4, #0 ++ b.le 90f ++ cmp w5, #0 ++ b.le 90f ++ // If w % 16 != 0 then -16 so we do main loop 1 fewer times with ++ // the remainder done in the tail ++ tst w4, #15 ++ b.eq 1f ++ sub w4, w4, #16 ++1: ++ ++// -------------------- Even line body - YUV ++11: ++ subs w9, w4, #0 ++ mov x10, x0 ++ mov x11, x1 ++ mov x12, x2 ++ mov x13, x3 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ 
smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ ++ b.gt 10b ++ ++// -------------------- Even line tail - YUV ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] 
++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ cmp w9, #-16 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, 
v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++ st1 {v24.s}[0], [x12], #4 ++ st1 {v30.s}[0], [x13], #4 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++ st1 {v24.h}[2], [x12], #2 ++ st1 {v30.h}[2], [x13], #2 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++ st1 {v24.b}[6], [x12], #1 ++ st1 {v30.b}[6], [x13], #1 ++1: tbz w9, #0, 1f ++ st1 {v26.b}[14], [x11] ++ st1 {v24.b}[7], [x12] ++ st1 {v30.b}[7], [x13] ++1: ++3: ++ ++// -------------------- Odd line body - Y only ++ ++ subs w5, w5, #1 ++ b.eq 90f ++ ++ subs w9, w4, #0 ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ mov x10, x0 ++ mov x11, x1 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ ++ b.gt 
10b ++ ++// -------------------- Odd line tail - Y ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ cmp w9, #-16 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++1: tbz w9, #0, 1f ++ st1 
{v26.b}[14], [x11] ++1: ++3: ++ ++// ------------------- Loop to start ++ ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ add x2, x2, w7, SXTX ++ add x3, x3, w7, SXTX ++ subs w5, w5, #1 ++ b.gt 11b ++90: ++ ret ++endfunc +diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c +index a7300f3ba4..ba1db155b0 100644 +--- a/libswscale/rgb2rgb.c ++++ b/libswscale/rgb2rgb.c +@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, +diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h +index 48bba1586a..6329533f18 100644 +--- a/libswscale/rgb2rgb.h ++++ b/libswscale/rgb2rgb.h +@@ -82,6 +82,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); + void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, 
+ uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + /** + * Height should be a multiple of 2 and width should be a multiple of 16. +@@ -131,6 +134,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + +diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c +index 42c69801ba..e711589e1e 100644 +--- a/libswscale/rgb2rgb_template.c ++++ b/libswscale/rgb2rgb_template.c +@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, + * others are ignored in the C version. 
+ * FIXME: Write HQ version. + */ +-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, +- int chromStride, int srcStride, int32_t *rgb2yuv) ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) + { +- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; +- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; +- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; + int y; + const int chromWidth = width >> 1; + +@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } + ydst += lumStride; + src += srcStride; + +@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } ++ udst += chromStride; ++ vdst += chromStride; ++ ydst += lumStride; ++ src 
+= srcStride; ++ } ++} ++ ++static const uint8_t x_rgb[9] = { ++ RY_IDX, GY_IDX, BY_IDX, ++ RU_IDX, GU_IDX, BU_IDX, ++ RV_IDX, GV_IDX, BV_IDX, ++}; ++ ++static const uint8_t x_bgr[9] = { ++ BY_IDX, GY_IDX, RY_IDX, ++ BU_IDX, GU_IDX, RU_IDX, ++ BV_IDX, GV_IDX, RV_IDX, ++}; ++ ++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) ++{ ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; ++ int y; ++ const int chromWidth = width >> 1; ++ ++ for (y = 0; y < height; y += 2) { ++ int i; ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if 
((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } ++ ydst += lumStride; ++ src += srcStride; ++ ++ if (y+1 == height) ++ break; ++ ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if ((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } + udst += chromStride; + vdst += chromStride; + ydst += lumStride; +@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + } + } + ++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++// As the general code does no SIMD-like ops simply adding 1 to the src 
address ++// will fix the ignored alpha position ++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++ + static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride) +@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void) + yuy2toyv12 = yuy2toyv12_c; + planar2x = planar2x_c; + ff_rgb24toyv12 = ff_rgb24toyv12_c; ++ ff_bgr24toyv12 = ff_bgr24toyv12_c; ++ ff_rgbxtoyv12 = ff_rgbxtoyv12_c; ++ ff_bgrxtoyv12 = ff_bgrxtoyv12_c; ++ ff_xrgbtoyv12 = ff_xrgbtoyv12_c; ++ ff_xbgrtoyv12 = ff_xbgrtoyv12_c; + interleaveBytes = interleaveBytes_c; + deinterleaveBytes = deinterleaveBytes_c; + vu9_to_vu12 = vu9_to_vu12_c; +diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c +index c4dd8a4d83..da38d7f8ac 100644 +--- a/libswscale/swscale_unscaled.c ++++ b/libswscale/swscale_unscaled.c +@@ -1655,6 +1655,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + return srcSliceH; + } + ++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgr24toyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ 
c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgrxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_rgbxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xbgrtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xrgbtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + 
(srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ + static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +@@ -2035,6 +2120,32 @@ void ff_get_unscaled_swscale(SwsContext *c) + (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && + !(flags & SWS_ACCURATE_RND)) + c->swscale = bgr24ToYv12Wrapper; ++ /* rgb24toYV12 */ ++ if (srcFormat == AV_PIX_FMT_RGB24 && ++ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgb24ToYv12Wrapper; ++ ++ /* bgrxtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = bgrxToYv12Wrapper; ++ /* rgbx24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgbxToYv12Wrapper; ++ /* xbgrtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xbgrToYv12Wrapper; ++ /* xrgb24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xrgbToYv12Wrapper; + + /* RGB/BGR -> RGB/BGR (no dither needed forms) */ + if (isAnyRGB(srcFormat) && 
isAnyRGB(dstFormat) && findRgbConvFn(c) +diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c +index 6c38041ddb..12776ffec7 100644 +--- a/libswscale/tests/swscale.c ++++ b/libswscale/tests/swscale.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #undef HAVE_AV_CONFIG_H + #include "libavutil/cpu.h" +@@ -78,6 +79,15 @@ struct Results { + uint32_t crc; + }; + ++static int time_rep = 0; ++ ++static uint64_t utime(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000; ++} ++ + // test by ref -> src -> dst -> out & compare out against ref + // ref & out are YV12 + static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, +@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + goto end; + } + +- printf(" %s %dx%d -> %s %3dx%3d flags=%2d", ++ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d", + desc_src->name, srcW, srcH, + desc_dst->name, dstW, dstH, + flags); +@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + + sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); + ++ if (time_rep != 0) ++ { ++ const uint64_t now = utime(); ++ uint64_t done; ++ for (i = 1; i != time_rep; ++i) { ++ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); ++ } ++ done = utime(); ++ printf(" T=%7"PRId64"us ", done-now); ++ } ++ + for (i = 0; i < 4 && dstStride[i]; i++) + crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], + dstStride[i] * dstH); +@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4], + return 0; + } + +-#define W 96 +-#define H 96 +- + int main(int argc, char **argv) + { ++ unsigned int W = 96; ++ unsigned int H = 96; ++ unsigned int W2; ++ unsigned int H2; ++ unsigned int S; + enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; + enum 
AVPixelFormat dstFormat = AV_PIX_FMT_NONE; +- uint8_t *rgb_data = av_malloc(W * H * 4); +- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; +- int rgb_stride[4] = { 4 * W, 0, 0, 0 }; +- uint8_t *data = av_malloc(4 * W * H); +- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; +- int stride[4] = { W, W, W, W }; + int x, y; + struct SwsContext *sws; + AVLFG rand; + int res = -1; + int i; + FILE *fp = NULL; +- +- if (!rgb_data || !data) +- return -1; ++ uint8_t *rgb_data; ++ uint8_t * rgb_src[4] = { NULL }; ++ int rgb_stride[4] = { 0 }; ++ uint8_t *data; ++ uint8_t * src[4] = { NULL }; ++ int stride[4] = { 0 }; + + for (i = 1; i < argc; i += 2) { ++ const char * const arg2 = argv[i+1]; ++ + if (argv[i][0] != '-' || i + 1 == argc) + goto bad_option; + if (!strcmp(argv[i], "-ref")) { +- fp = fopen(argv[i + 1], "r"); ++ fp = fopen(arg2, "r"); + if (!fp) { +- fprintf(stderr, "could not open '%s'\n", argv[i + 1]); ++ fprintf(stderr, "could not open '%s'\n", arg2); + goto error; + } + } else if (!strcmp(argv[i], "-cpuflags")) { + unsigned flags = av_get_cpu_flags(); +- int ret = av_parse_cpu_caps(&flags, argv[i + 1]); ++ int ret = av_parse_cpu_caps(&flags, arg2); + if (ret < 0) { +- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid cpu flags %s\n", arg2); + return ret; + } + av_force_cpu_flags(flags); + } else if (!strcmp(argv[i], "-src")) { +- srcFormat = av_get_pix_fmt(argv[i + 1]); ++ srcFormat = av_get_pix_fmt(arg2); + if (srcFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-dst")) { +- dstFormat = av_get_pix_fmt(argv[i + 1]); ++ dstFormat = av_get_pix_fmt(arg2); + if (dstFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); ++ return -1; ++ } ++ 
} else if (!strcmp(argv[i], "-w")) { ++ char * p = NULL; ++ W = strtoul(arg2, &p, 0); ++ if (!W || *p) { ++ fprintf(stderr, "bad width %s\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-h")) { ++ char * p = NULL; ++ H = strtoul(arg2, &p, 0); ++ if (!H || *p) { ++ fprintf(stderr, "bad height '%s'\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-t")) { ++ char * p = NULL; ++ time_rep = (int)strtol(arg2, &p, 0); ++ if (*p) { ++ fprintf(stderr, "bad time repetitions '%s'\n", arg2); + return -1; + } + } else { +@@ -414,15 +457,34 @@ bad_option: + } + } + +- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, ++ S = (W + 15) & ~15; ++ rgb_data = av_mallocz(S * H * 4); ++ rgb_src[0] = rgb_data; ++ rgb_stride[0] = 4 * S; ++ data = av_mallocz(4 * S * H); ++ src[0] = data; ++ src[1] = data + S * H; ++ src[2] = data + S * H * 2; ++ src[3] = data + S * H * 3; ++ stride[0] = S; ++ stride[1] = S; ++ stride[2] = S; ++ stride[3] = S; ++ H2 = H < 96 ? 8 : H / 12; ++ W2 = W < 96 ? 
8 : W / 12; ++ ++ if (!rgb_data || !data) ++ return -1; ++ ++ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H, + AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); + + av_lfg_init(&rand, 1); + + for (y = 0; y < H; y++) + for (x = 0; x < W * 4; x++) +- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); +- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride); ++ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand); ++ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride); + if (res < 0 || res != H) { + res = -1; + goto error; +@@ -431,10 +493,10 @@ bad_option: + av_free(rgb_data); + + if(fp) { +- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); ++ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat); + fclose(fp); + } else { +- selfTest(src, stride, W, H, srcFormat, dstFormat); ++ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat); + res = 0; + } + error: diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000000..b050971f63 +index 0000000000..2b62d660c0 --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,59 @@ +@@ -0,0 +1,67 @@ +Building Pi FFmpeg +================== + @@ -74,6 +70731,8 @@ index 0000000000..b050971f63 + paths being confused and therefore running the wrong code, Shared + is what is needed, in most cases, when building for use by other + programs. ++ --usr Set install dir to /usr (i.e. system default) rather than in ++ /install + +So for a static build +--------------------- @@ -87,25 +70746,31 @@ index 0000000000..b050971f63 +For a shared build +------------------ + ++There are two choices here ++ +$ pi-util/conf_native.sh -+ -+You will normally want an install target if shared. Note that the script has -+set this up to be generated in out//install, you don't have to worry -+about overwriting your system libs. 
-+ +$ make -j8 -C out/ install + ++This sets the install prefix to /install and is probably what you ++want if you don't want to overwrite the system files. ++ +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was -+built or install the image on the system - you have to be careful to get rid -+of all other ffmpeg libs or confusion may result. There is a little script -+that wipes all other versions - obviously use with care! ++built. You can copy the contents of /install to /usr and that mostly ++works. The only downside is that paths in pkgconfig end up being set to the ++install directory in your build directory which may be less than ideal when ++building other packages. + ++The alternative if you just want to replace the system libs is: ++ ++$ pi-util/conf_native.sh --usr ++$ make -j8 -C out/ +$ sudo pi-util/clean_usr_libs.sh ++$ sudo make -j8 -C out/ install + -+Then simply copying from the install to /usr works -+ -+$ sudo cp -r out//install/* /usr -+ ++The clean_usr_libs.sh step wipes any existing libs & includes (for all ++architectures) from the system which helps avoid confusion when running other ++progs as you can be sure you're not running old code which is unfortunately ++easy to do otherwise. 
+ diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt new file mode 100644 @@ -188,95 +70853,111 @@ index 0000000000..92bc13a3df --- /dev/null +++ b/pi-util/TESTMESA.txt @@ -0,0 +1,82 @@ -+# Setup & Build instructions for testing Argon30 mesa support (on Pi4) -+ -+# These assume that the drm_mmal test for Sand8 has been built on this Pi -+# as build relies on many of the same files -+ -+# 1st get everything required to build ffmpeg -+# If sources aren't already enabled on your Pi then enable them -+sudo su -+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list -+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list -+mv /tmp/sources.list /etc/apt/ -+mv /tmp/raspi.list /etc/apt/sources.list.d/ -+apt update -+ -+# Get dependancies -+sudo apt build-dep ffmpeg -+ -+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev -+ -+# Enable H265 V4L2 request decoder -+sudo su -+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt -+# You may also want to add more CMA if you are going to try 4k videos -+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read -+# dtoverlay=vc4-fkms-v3d,cma-512 -+reboot -+# Check it has turned up -+ls -la /dev/video* -+# This should include video19 -+# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19 -+ -+# Currently on the Pi the linux headers from the debian distro don't match -+# the kernel that we ship and we need to update them - hopefully this step -+# will be unneeded in the future -+sudo apt install git bc bison flex libssl-dev make -+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y -+cd linux -+KERNEL=kernel7l -+make bcm2711_defconfig -+make headers_install -+sudo cp -r usr/include/linux /usr/include -+cd .. 
-+ -+# Config - this builds a staticly linked ffmpeg which is easier for testing -+pi-util/conf_native.sh --noshared -+ -+# Build (this is a bit dull) -+# If you want to poke the source the libavdevice/egl_vout.c contains the -+# output code - -+cd out/armv7-static-rel -+ -+# Check that you have actually configured V4L2 request -+grep HEVC_V4L2REQUEST config.h -+# You are hoping for -+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1 -+# if you get 0 then the config has failed -+ -+make -j6 -+ -+# Grab test streams -+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv -+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv -+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv -+ -+# Test i420 output (works currently) -+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl - -+ -+# Test Sand8 output - doesn't currently work but should once you have -+# Sand8 working in drm_mmal. I can't guarantee that this will work as -+# I can't test this path with a known working format, but the debug looks -+# good. 
If this doesn't work & drm_mmal does with sand8 then come back to me -+# The "show_all 1" forces vout to display every frame otherwise it drops any -+# frame that would cause it to block -+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl - -+ -+# Test Sand30 - doesn't currently work -+# (Beware that when FFmpeg errors out it often leaves your teminal window -+# in a state where you need to reset it) -+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl - -+ -+ -+ ++# Setup & Build instructions for testing Argon30 mesa support (on Pi4) ++ ++# These assume that the drm_mmal test for Sand8 has been built on this Pi ++# as build relies on many of the same files ++ ++# 1st get everything required to build ffmpeg ++# If sources aren't already enabled on your Pi then enable them ++sudo su ++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list ++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list ++mv /tmp/sources.list /etc/apt/ ++mv /tmp/raspi.list /etc/apt/sources.list.d/ ++apt update ++ ++# Get dependencies ++sudo apt build-dep ffmpeg ++ ++sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev ++ ++# Enable H265 V4L2 request decoder ++sudo su ++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt ++# You may also want to add more CMA if you are going to try 4k videos ++# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read ++# dtoverlay=vc4-fkms-v3d,cma-512 ++reboot ++# Check it has turned up ++ls -la /dev/video* ++# This should include video19 ++# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19 ++ ++# Currently on the Pi the linux headers from the debian distro don't match ++# the kernel that we ship and we need to update them - hopefully this step ++# will be unneeded in the future ++sudo apt install git bc bison flex libssl-dev make ++git clone --depth=1 
https://github.com/raspberrypi/linux --branch rpi-5.10.y ++cd linux ++KERNEL=kernel7l ++make bcm2711_defconfig ++make headers_install ++sudo cp -r usr/include/linux /usr/include ++cd .. ++ ++# Config - this builds a statically linked ffmpeg which is easier for testing ++pi-util/conf_native.sh --noshared ++ ++# Build (this is a bit dull) ++# If you want to poke the source the libavdevice/egl_vout.c contains the ++# output code - ++cd out/armv7-static-rel ++ ++# Check that you have actually configured V4L2 request ++grep HEVC_V4L2REQUEST config.h ++# You are hoping for ++# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1 ++# if you get 0 then the config has failed ++ ++make -j6 ++ ++# Grab test streams ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv ++ ++# Test i420 output (works currently) ++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl - ++ ++# Test Sand8 output - doesn't currently work but should once you have ++# Sand8 working in drm_mmal. I can't guarantee that this will work as ++# I can't test this path with a known working format, but the debug looks ++# good. 
If this doesn't work & drm_mmal does with sand8 then come back to me ++# The "show_all 1" forces vout to display every frame otherwise it drops any ++# frame that would cause it to block ++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl - ++ ++# Test Sand30 - doesn't currently work ++# (Beware that when FFmpeg errors out it often leaves your teminal window ++# in a state where you need to reset it) ++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl - ++ ++ ++ diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh new file mode 100755 -index 0000000000..b3b2d5509d +index 0000000000..01bd6a6a22 --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,26 @@ +@@ -0,0 +1,42 @@ +set -e ++U=/usr/include/arm-linux-gnueabihf ++rm -rf $U/libavcodec ++rm -rf $U/libavdevice ++rm -rf $U/libavfilter ++rm -rf $U/libavformat ++rm -rf $U/libavutil ++rm -rf $U/libswresample ++rm -rf $U/libswscale ++U=/usr/include/aarch64-linux-gnu ++rm -rf $U/libavcodec ++rm -rf $U/libavdevice ++rm -rf $U/libavfilter ++rm -rf $U/libavformat ++rm -rf $U/libavutil ++rm -rf $U/libswresample ++rm -rf $U/libswscale +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* @@ -359,510 +71040,510 @@ index 0000000000..4efd5d1c67 --- /dev/null +++ b/pi-util/conf_h265.2016.csv @@ -0,0 +1,195 @@ -+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8 -+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8 -+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8 -+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8 -+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8 -+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8 -+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8 -+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8 
-+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8 -+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8 -+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8 -+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8 -+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8 -+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8 -+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8 -+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8 -+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8 -+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8 -+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8 -+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8 -+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8 -+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10 -+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8 -+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8 -+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8 -+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8 -+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8 -+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8 -+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8 -+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8 -+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8 -+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8 -+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8 -+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8 -+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8 
-+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8 -+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8 -+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8 -+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8 -+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8 -+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8 -+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8 -+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10 -+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8 -+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8 -+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8 -+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8 -+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8 -+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8 -+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8 -+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8 -+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8 -+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8 -+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8 -+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8 -+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8 -+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8 -+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8 -+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8 -+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8 -+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8 -+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8 -+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8 
-+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8 -+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8 -+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8 -+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8 -+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8 -+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8 -+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8 -+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8 -+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8 -+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8 -+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8 -+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8 -+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8 -+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8 -+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8 -+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8 -+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8 -+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8 -+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8 -+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8 -+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8 -+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8 -+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8 -+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8 -+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8 -+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8 
-+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8 -+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8 -+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8 -+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8 -+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8 -+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8 -+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8 -+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8 -+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8 -+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8 -+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8 -+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8 -+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8 -+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8 -+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8 -+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8 -+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8 -+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8 -+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8 -+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8 -+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8 -+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8 -+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8 -+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8 -+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8 -+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8 -+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8 -+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8 
-+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8 -+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8 -+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8 -+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8 -+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8 -+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8 -+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8 -+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8 -+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8 -+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8 -+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10 -+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8 -+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8 -+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8 -+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10 -+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8 -+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8 -+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10 -+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10 -+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8 -+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10 -+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8 -+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10 
-+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8 -+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10 -+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8 -+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10 -+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8 -+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10 -+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8 -+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0 -+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8 -+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8 -+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10 -+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8 -+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8 -+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0 -+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 -+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10 -+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0 -+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0 -+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0 -+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0 -+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0 -+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0 -+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0 -+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8 -+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8 -+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0 
-+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8 -+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0 -+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0 -+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0 -+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0 -+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0 -+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0 -+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0 -+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8 -+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10 -+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10 -+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8 -+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8 -+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8 -+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8 -+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8 -+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8 -+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8 -+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8 -+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8 -+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8 
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8 
++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8 
++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8 ++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8 
++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8 ++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8 
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8 ++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10 ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8 ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10 
++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10 ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8 ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8 ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0 ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0 
++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0 ++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0 ++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8 
++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8 ++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8 ++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8 ++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8 diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv new file mode 100644 index 0000000000..6082641271 --- /dev/null +++ b/pi-util/conf_h265.2016_HEVC_v1.csv @@ -0,0 +1,147 @@ -+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 -+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 
-+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 -+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 -+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt -+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 -+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 -+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 -+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth -+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 index 0000000000..fc14f2a3c2 
--- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ -+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 -+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 -+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 -+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 -+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 -+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 -+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 -+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched 
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 ++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 ++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 ++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 ++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 ++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh new file mode 100755 -index 0000000000..65576846e8 +index 0000000000..5fb69ccee2 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,108 @@ +@@ -0,0 +1,127 @@ +echo "Configure for native build" + +FFSRC=`pwd` @@ -874,6 +71555,7 @@ index 0000000000..65576846e8 + +NOSHARED= +MMAL= ++USR_PREFIX= + +while [ "$1" != "" ] ; do + case $1 in @@ -883,8 +71565,14 @@ index 0000000000..65576846e8 + --mmal) + MMAL=1 + ;; ++ --usr) ++ USR_PREFIX=/usr ++ ;; + *) -+ echo "Usage $0: [--noshared] [--mmal]" ++ echo "Usage $0: [--noshared] [--mmal] [--usr]" ++ echo " noshared Build static libs and executable - good for testing" ++ echo " mmal Build mmal decoders" ++ echo " usr Set install prefix to /usr [default=/install]" + exit 1 + ;; + esac @@ -898,18 +71586,28 @@ index 0000000000..65576846e8 +RPI_DEFINES= +RPI_EXTRALIBS= + -+if [ "$MC" == "arm64" ]; then -+ echo "M/C aarch64" -+ A=aarch64-linux-gnu -+ B=arm64 -+elif [ "$MC" == "armhf" ]; then -+ echo "M/C armv7" -+ A=arm-linux-gnueabihf -+ B=armv7 -+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" -+ RPI_DEFINES=-mfpu=neon-vfpv4 ++# uname -m gives 
kernel type which may not have the same ++# 32/64bitness as userspace :-( getconf shoudl provide the answer ++# but use uname to check we are on the right processor ++MC=`uname -m` ++LB=`getconf LONG_BIT` ++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then ++ if [ "$LB" == "32" ]; then ++ echo "M/C armv7" ++ A=arm-linux-gnueabihf ++ B=armv7 ++ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" ++ RPI_DEFINES=-mfpu=neon-vfpv4 ++ elif [ "$LB" == "64" ]; then ++ echo "M/C aarch64" ++ A=aarch64-linux-gnu ++ B=arm64 ++ else ++ echo "Unknown LONG_BIT name: $LB" ++ exit 1 ++ fi +else -+ echo Unexpected architecture $MC ++ echo "Unknown machine name: $MC" + exit 1 +fi + @@ -937,7 +71635,9 @@ index 0000000000..65576846e8 + OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + -+USR_PREFIX=$OUT/install ++if [ ! $USR_PREFIX ]; then ++ USR_PREFIX=$OUT/install ++fi +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + @@ -956,10 +71656,9 @@ index 0000000000..65576846e8 + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ -+ --enable-epoxy\ -+ --enable-libudev\ + --enable-vout-egl\ + --enable-vout-drm\ ++ --enable-gpl\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ @@ -968,6 +71667,7 @@ index 0000000000..65576846e8 + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + ++echo "Configured into $OUT" + +# gcc option for getting asm listing +# -Wa,-ahls @@ -1544,6 +72244,95 @@ index 0000000000..a4dbb6eacd +$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h + +diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py +new file mode 100755 +index 0000000000..b322dac0c2 +--- /dev/null ++++ b/pi-util/testfilt.py +@@ -0,0 +1,83 @@ ++#!/usr/bin/env python3 ++ ++import string ++import os ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class validator: ++ def __init__(self): ++ 
self.ok = False ++ ++ def isok(self): ++ return self.ok ++ ++ def setok(self): ++ self.ok = True ++ ++class valid_regex(validator): ++ def __init__(self, regex): ++ super().__init__() ++ self.regex = re.compile(regex) ++ ++ def scanline(self, line): ++ if self.isok() or self.regex.search(line): ++ self.setok() ++ ++ ++def validate(validators, flog): ++ for line in flog: ++ for v in validators: ++ v.scanline(line) ++ ++ ok = True ++ for v in validators: ++ if not v.isok(): ++ ok = False ++ # complain ++ print("Test failed") ++ ++ if ok: ++ print("OK") ++ return ok ++ ++def runtest(name, ffmpeg, args, suffix, validators): ++ log_root = os.path.join("/tmp", "testfilt", name) ++ ofilename = os.path.join(log_root, name + suffix) ++ ++ if not os.path.exists(log_root): ++ os.makedirs(log_root) ++ ++ try: ++ os.remove(ofilename) ++ except: ++ pass ++ ++ flog = open(os.path.join(log_root, name + ".log"), "wb") ++ ffargs = [ffmpeg] + args + [ofilename] ++ ++ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False) ++ flog.close ++ ++ flog = open(os.path.join(log_root, name + ".log"), "rt") ++ return validate(validators, flog) ++ ++def sayok(log_root, flog): ++ print("Woohoo") ++ return True ++ ++if __name__ == '__main__': ++ ++ argp = argparse.ArgumentParser(description="FFmpeg filter tester") ++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name") ++ args = argp.parse_args() ++ ++ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i", ++ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv", ++# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv", ++ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv", ++ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')]) diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py new file mode 100755 index 0000000000..5935a11ca5 @@ -1678,33280 +72467,1084 @@ index 0000000000..5935a11ca5 + + do_logparse(args.logfile) + 
- -From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 11:27:39 +0100 -Subject: [PATCH 002/136] Add sand pix fmts & conversion fns - ---- - configure | 3 + - libavutil/Makefile | 3 + - libavutil/arm/Makefile | 1 + - libavutil/arm/rpi_sand_neon.S | 768 ++++++++++++++++++++++++++++++++++ - libavutil/arm/rpi_sand_neon.h | 99 +++++ - libavutil/pixdesc.c | 44 ++ - libavutil/pixfmt.h | 6 + - libavutil/rpi_sand_fn_pw.h | 227 ++++++++++ - libavutil/rpi_sand_fns.c | 353 ++++++++++++++++ - libavutil/rpi_sand_fns.h | 183 ++++++++ - 10 files changed, 1687 insertions(+) - create mode 100644 libavutil/arm/rpi_sand_neon.S - create mode 100644 libavutil/arm/rpi_sand_neon.h - create mode 100644 libavutil/rpi_sand_fn_pw.h - create mode 100644 libavutil/rpi_sand_fns.c - create mode 100644 libavutil/rpi_sand_fns.h - -diff --git a/configure b/configure -index b6616f00b6..27112ced58 100755 ---- a/configure -+++ b/configure -@@ -344,6 +344,7 @@ External library support: - --enable-libvpl enable Intel oneVPL code via libvpl if libmfx is not used [no] - --enable-libnpp enable Nvidia Performance Primitives-based code [no] - --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] -+ --enable-sand enable sand video formats [rpi] - --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] - --disable-nvenc disable Nvidia video encoding code [autodetect] - --enable-omx enable OpenMAX IL code [no] -@@ -1930,6 +1931,7 @@ FEATURE_LIST=" - omx_rpi - runtime_cpudetect - safe_bitstream_reader -+ sand - shared - small - static -@@ -2495,6 +2497,7 @@ CONFIG_EXTRA=" - rtpdec - rtpenc_chain - rv34dsp -+ sand - scene_sad - sinewin - snappy -diff --git a/libavutil/Makefile b/libavutil/Makefile -index dc9012f9a8..e33f5db099 100644 ---- a/libavutil/Makefile -+++ b/libavutil/Makefile -@@ -73,6 +73,7 @@ HEADERS = adler32.h \ - rational.h \ - replaygain.h \ - ripemd.h \ -+ rpi_sand_fns.h \ - 
samplefmt.h \ - sha.h \ - sha512.h \ -@@ -192,6 +193,7 @@ OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o - OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o - OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o - OBJS-$(CONFIG_QSV) += hwcontext_qsv.o -+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o - OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o - OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o - OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o -@@ -212,6 +214,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.h - SKIPHEADERS-$(CONFIG_DXVA2) += hwcontext_dxva2.h - SKIPHEADERS-$(CONFIG_QSV) += hwcontext_qsv.h - SKIPHEADERS-$(CONFIG_OPENCL) += hwcontext_opencl.h -+SKIPHEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h - SKIPHEADERS-$(CONFIG_VAAPI) += hwcontext_vaapi.h - SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.h - SKIPHEADERS-$(CONFIG_VDPAU) += hwcontext_vdpau.h -diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile -index 5da44b0542..b74b7c4e2f 100644 ---- a/libavutil/arm/Makefile -+++ b/libavutil/arm/Makefile -@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ +diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile +index 1827a4e134..3c765a5eb1 100644 +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o + AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o + AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o + AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o ++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o + AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o + AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o ++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o + AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o + AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o - NEON-OBJS += arm/float_dsp_init_neon.o \ - arm/float_dsp_neon.o \ -+ arm/rpi_sand_neon.o \ -diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S -new file mode 100644 -index 0000000000..80890fe985 ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.S 
-@@ -0,0 +1,768 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+#include "libavutil/arm/asm.S" -+ -+ -+@ General notes: -+@ Having done some timing on this in sand8->y8 (Pi4) -+@ vst1 (680fps) is a bit faster than vstm (660fps) -+@ vldm (680fps) is noticably faster than vld1 (480fps) -+@ (or it might be that a mix is what is required) -+@ -+@ At least on a Pi4 it is no more expensive to have a single auto-inc register -+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted -+@ the latter was better) -+@ -+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless -+@ the memory is uncached. -+@ As these are Sand -> planar we can assume that src is going to be aligned but -+@ it is possible that dest isn't (converting to .yuv or other packed format). -+@ Luckily vst1 is faster than vstm :-) so all is well -+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 -+@ .8 stores would let us do non-word aligned stores into uncached but it -+@ probably isn't worth it. 
-+ -+ -+ -+ -+@ void ff_rpi_sand128b_stripe_to_8_10( -+@ uint8_t * dest, // [r0] -+@ const uint8_t * src1, // [r1] -+@ const uint8_t * src2, // [r2] -+@ unsigned int lines); // [r3] -+ -+.macro stripe2_to_8, bit_depth -+ vpush {q4-q7} -+1: -+ vldm r1!, {q0-q7} -+ subs r3, #1 -+ vldm r2!, {q8-q15} -+ vqrshrn.u16 d0, q0, #\bit_depth - 8 -+ vqrshrn.u16 d1, q1, #\bit_depth - 8 -+ vqrshrn.u16 d2, q2, #\bit_depth - 8 -+ vqrshrn.u16 d3, q3, #\bit_depth - 8 -+ vqrshrn.u16 d4, q4, #\bit_depth - 8 -+ vqrshrn.u16 d5, q5, #\bit_depth - 8 -+ vqrshrn.u16 d6, q6, #\bit_depth - 8 -+ vqrshrn.u16 d7, q7, #\bit_depth - 8 -+ vqrshrn.u16 d8, q8, #\bit_depth - 8 -+ vqrshrn.u16 d9, q9, #\bit_depth - 8 -+ vqrshrn.u16 d10, q10, #\bit_depth - 8 -+ vqrshrn.u16 d11, q11, #\bit_depth - 8 -+ vqrshrn.u16 d12, q12, #\bit_depth - 8 -+ vqrshrn.u16 d13, q13, #\bit_depth - 8 -+ vqrshrn.u16 d14, q14, #\bit_depth - 8 -+ vqrshrn.u16 d15, q15, #\bit_depth - 8 -+ vstm r0!, {q0-q7} -+ bne 1b -+ vpop {q4-q7} -+ bx lr -+.endm -+ -+function ff_rpi_sand128b_stripe_to_8_10, export=1 -+ stripe2_to_8 10 -+endfunc -+ -+@ void ff_rpi_sand8_lines_to_planar_y8( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand8_lines_to_planar_y8, export=1 -+ push {r4-r8, lr} @ +24 L -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ lsl r3, #7 -+ sub r1, r6 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2, {q8-q15} -+ add r2, r3 -+ subs r5, #128 -+ blt 2f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d20, d21, d22, d23}, [r0]! -+ vst1.8 {d24, d25, d26, d27}, [r0]! -+ vst1.8 {d28, d29, d30, d31}, [r0]! -+ bne 1b -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #64-128 -+ blt 1f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d20, d21, d22, d23}, [r0]! -+ beq 11b -+ vmov q8, q12 -+ vmov q9, q13 -+ sub r5, #64 -+ vmov q10, q14 -+ vmov q11, q15 -+1: -+ cmp r5, #32-128 -+ blt 1f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ beq 11b -+ vmov q8, q10 -+ sub r5, #32 -+ vmov q9, q11 -+1: -+ cmp r5, #16-128 -+ blt 1f -+ vst1.8 {d16, d17}, [r0]! -+ beq 11b -+ sub r5, #16 -+ vmov q8, q9 -+1: -+ cmp r5, #8-128 -+ blt 1f -+ vst1.8 {d16}, [r0]! -+ beq 11b -+ sub r5, #8 -+ vmov d16, d17 -+1: -+ cmp r5, #4-128 -+ blt 1f -+ vst1.32 {d16[0]}, [r0]! -+ beq 11b -+ sub r5, #4 -+ vshr.u64 d16, #32 -+1: -+ cmp r5, #2-128 -+ blt 1f -+ vst1.16 {d16[0]}, [r0]! -+ beq 11b -+ vst1.8 {d16[2]}, [r0]! -+ b 11b -+1: -+ vst1.8 {d16[0]}, [r0]! 
-+ b 11b -+endfunc -+ -+@ void ff_rpi_sand8_lines_to_planar_c8( -+@ uint8_t * dst_u, // [r0] -+@ unsigned int dst_stride_u, // [r1] -+@ uint8_t * dst_v, // [r2] -+@ unsigned int dst_stride_v, // [r3] -+@ const uint8_t * src, // [sp, #0] -> r4, r5 -+@ unsigned int stride1, // [sp, #4] 128 -+@ unsigned int stride2, // [sp, #8] -> r8 -+@ unsigned int _x, // [sp, #12] 0 -+@ unsigned int y, // [sp, #16] (r7 in prefix) -+@ unsigned int _w, // [sp, #20] -> r12, r6 -+@ unsigned int h); // [sp, #24] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. However it does respect the dest size for writing -+ -+function ff_rpi_sand8_lines_to_planar_c8, export=1 -+ push {r4-r8, lr} @ +24 -+ -+ ldr r5, [sp, #24] -+ ldr r8, [sp, #32] -+ ldr r7, [sp, #40] -+ ldr r6, [sp, #44] -+ lsl r8, #7 -+ add r5, r5, r7, lsl #7 -+ sub r1, r1, r6 -+ sub r3, r3, r6 -+ ldr r7, [sp, #48] -+ vpush {q4-q7} -+ -+10: -+ mov r4, r5 -+ mov r12, r6 -+1: -+ subs r12, #64 -+ vldm r4, {q0-q7} -+ add r4, r8 -+ it gt -+ vldmgt r4, {q8-q15} -+ add r4, r8 -+ -+ vuzp.8 q0, q1 -+ vuzp.8 q2, q3 -+ vuzp.8 q4, q5 -+ vuzp.8 q6, q7 -+ -+ vuzp.8 q8, q9 -+ vuzp.8 q10, q11 -+ vuzp.8 q12, q13 -+ vuzp.8 q14, q15 -+ subs r12, #64 -+ -+ @ Rearrange regs so we can use vst1 with 4 regs -+ vswp q1, q2 -+ vswp q5, q6 -+ vswp q9, q10 -+ vswp q13, q14 -+ blt 2f -+ -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d8, d9, d10, d11}, [r0]! -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d24, d25, d26, d27}, [r0]! -+ -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ vst1.8 {d12, d13, d14, d15}, [r2]! -+ vst1.8 {d20, d21, d22, d23}, [r2]! -+ vst1.8 {d28, d29, d30, d31}, [r2]! -+ bne 1b -+11: -+ subs r7, #1 -+ add r5, #128 -+ add r0, r1 -+ add r2, r3 -+ bne 10b -+ vpop {q4-q7} -+ pop {r4-r8,pc} -+ -+2: -+ cmp r12, #64-128 -+ blt 1f -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d8, d9, d10, d11}, [r0]! -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ vst1.8 {d12, d13, d14, d15}, [r2]! 
-+ beq 11b -+ sub r12, #64 -+ vmov q0, q8 -+ vmov q1, q9 -+ vmov q2, q10 -+ vmov q3, q11 -+ vmov q4, q12 -+ vmov q5, q13 -+ vmov q6, q14 -+ vmov q7, q15 -+1: -+ cmp r12, #32-128 -+ blt 1f -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ beq 11b -+ sub r12, #32 -+ vmov q0, q4 -+ vmov q1, q5 -+ vmov q2, q6 -+ vmov q3, q7 -+1: -+ cmp r12, #16-128 -+ blt 1f -+ vst1.8 {d0, d1 }, [r0]! -+ vst1.8 {d4, d5 }, [r2]! -+ beq 11b -+ sub r12, #16 -+ vmov q0, q1 -+ vmov q2, q3 -+1: -+ cmp r12, #8-128 -+ blt 1f -+ vst1.8 {d0}, [r0]! -+ vst1.8 {d4}, [r2]! -+ beq 11b -+ sub r12, #8 -+ vmov d0, d1 -+ vmov d4, d5 -+1: -+ cmp r12, #4-128 -+ blt 1f -+ vst1.32 {d0[0]}, [r0]! -+ vst1.32 {d4[0]}, [r2]! -+ beq 11b -+ sub r12, #4 -+ vmov s0, s1 -+ vmov s8, s9 -+1: -+ cmp r12, #2-128 -+ blt 1f -+ vst1.16 {d0[0]}, [r0]! -+ vst1.16 {d4[0]}, [r2]! -+ beq 11b -+ vst1.8 {d0[2]}, [r0]! -+ vst1.8 {d4[2]}, [r2]! -+ b 11b -+1: -+ vst1.8 {d0[0]}, [r0]! -+ vst1.8 {d4[0]}, [r2]! -+ b 11b -+endfunc -+ -+ -+ -+@ void ff_rpi_sand30_lines_to_planar_y16( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ vmov.u16 q15, #0x3ff -+ sub r3, #1 -+ lsl r3, #7 -+ sub r1, r1, r6, lsl #1 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2!, {q10-q13} -+ add lr, #64 -+ -+ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! -+ ands lr, #127 -+ vshrn.u32 d2, q10, #10 -+ vmovn.u32 d0, q10 -+ vmovn.u32 d4, q14 -+ -+ vshr.u32 q14, q11, #20 -+ it eq -+ addeq r2, r3 -+ vshrn.u32 d3, q11, #10 -+ vmovn.u32 d1, q11 -+ vmovn.u32 d5, q14 -+ -+ subs r5, #48 -+ vand q0, q15 -+ vand q1, q15 -+ vand q2, q15 -+ -+ vshr.u32 q14, q12, #20 -+ vshrn.u32 d18, q12, #10 -+ vmovn.u32 d16, q12 -+ vmovn.u32 d20, q14 -+ -+ vshr.u32 q14, q13, #20 -+ vshrn.u32 d19, q13, #10 -+ vmovn.u32 d17, q13 -+ vmovn.u32 d21, q14 -+ -+ vand q8, q15 -+ vand q9, q15 -+ vand q10, q15 -+ blt 2f -+ -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4], r12 -+ vst3.16 {d16, d18, d20}, [r0], r12 -+ vst3.16 {d17, d19, d21}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #24-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4] -+ beq 11b -+ vmov q0, q8 -+ sub r5, #24 -+ vmov q1, q9 -+ vmov q2, q10 -+1: -+ cmp r5, #12-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0]! -+ beq 11b -+ vmov d0, d1 -+ sub r5, #12 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r5, #6-48 -+ add r4, r0, #6 @ avoid [r0]! on sequential instructions -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0] -+ vst3.16 {d0[1], d2[1], d4[1]}, [r4] -+ add r0, #12 -+ beq 11b -+ vmov s0, s1 -+ sub r5, #6 -+ vmov s4, s5 -+ vmov s8, s9 -+1: -+ cmp r5, #3-48 -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
-+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #2-48 -+ blt 1f -+ vst2.16 {d0[0], d2[0]}, [r0]! -+ b 11b -+1: -+ vst1.16 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ -+ -+@ void ff_rpi_sand30_lines_to_planar_c16( -+@ uint8_t * dst_u, // [r0] -+@ unsigned int dst_stride_u, // [r1] -+@ uint8_t * dst_v, // [r2] -+@ unsigned int dst_stride_v, // [r3] -+@ const uint8_t * src, // [sp, #0] -> r4, r5 -+@ unsigned int stride1, // [sp, #4] 128 -+@ unsigned int stride2, // [sp, #8] -> r8 -+@ unsigned int _x, // [sp, #12] 0 -+@ unsigned int y, // [sp, #16] (r7 in prefix) -+@ unsigned int _w, // [sp, #20] -> r6, r9 -+@ unsigned int h); // [sp, #24] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ push {r4-r10, lr} @ +32 -+ ldr r5, [sp, #32] -+ ldr r8, [sp, #40] -+ ldr r7, [sp, #48] -+ ldr r9, [sp, #52] -+ mov r12, #48 -+ vmov.u16 q15, #0x3ff -+ sub r8, #1 -+ lsl r8, #7 -+ add r5, r5, r7, lsl #7 -+ sub r1, r1, r9, lsl #1 -+ sub r3, r3, r9, lsl #1 -+ ldr r7, [sp, #56] -+10: -+ mov lr, #0 -+ mov r4, r5 -+ mov r6, r9 -+1: -+ vldm r4!, {q0-q3} -+ add lr, #64 -+ -+ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 -+ vshr.u32 q14, q0, #20 -+ vshrn.u32 d16, q0, #10 -+ vmovn.u32 d18, q0 -+ ands lr, #127 -+ vmovn.u32 d20, q14 -+ -+ vshr.u32 q14, q1, #20 -+ vshrn.u32 d17, q1, #10 -+ vmovn.u32 d19, q1 -+ vmovn.u32 d21, q14 -+ -+ vshr.u32 q14, q2, #20 -+ vshrn.u32 d22, q2, #10 -+ vmovn.u32 d24, q2 -+ vmovn.u32 d26, q14 -+ -+ vshr.u32 q14, q3, #20 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d25, q3 -+ add r10, r0, #24 -+ vmovn.u32 d27, q14 -+ -+ it eq -+ addeq r4, r8 -+ vuzp.16 q8, q11 -+ vuzp.16 q9, q12 -+ vuzp.16 q10, q13 -+ -+ @ q8 V0, V3,.. -> q0 -+ @ q9 U0, U3... -+ @ q10 U1, U4... -+ @ q11 U2, U5,.. -+ @ q12 V1, V4,.. -> q1 -+ @ q13 V2, V5,.. 
-> q2 -+ -+ subs r6, #24 -+ vand q11, q15 -+ vand q9, q15 -+ vand q10, q15 -+ vand q0, q8, q15 -+ vand q1, q12, q15 -+ vand q2, q13, q15 -+ -+ blt 2f -+ -+ vst3.16 {d18, d20, d22}, [r0], r12 -+ vst3.16 {d19, d21, d23}, [r10] -+ add r10, r2, #24 -+ vst3.16 {d0, d2, d4}, [r2], r12 -+ vst3.16 {d1, d3, d5}, [r10] -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r5, #128 -+ add r0, r1 -+ add r2, r3 -+ bne 10b -+ -+ pop {r4-r10, pc} -+ -+@ Partial final write -+2: -+ cmp r6, #-12 -+ blt 1f -+ vst3.16 {d18, d20, d22}, [r0]! -+ vst3.16 {d0, d2, d4}, [r2]! -+ beq 11b -+ vmov d18, d19 -+ vmov d20, d21 -+ vmov d22, d23 -+ sub r6, #12 -+ vmov d0, d1 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r6, #-18 -+ @ Rezip here as it makes the remaining tail handling easier -+ vzip.16 d0, d18 -+ vzip.16 d2, d20 -+ vzip.16 d4, d22 -+ blt 1f -+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! -+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! -+ vst3.16 {d0[3], d2[3], d4[3]}, [r0]! -+ vst3.16 {d0[2], d2[2], d4[2]}, [r2]! -+ beq 11b -+ vmov d0, d18 -+ vmov d2, d20 -+ sub r6, #6 -+ vmov d4, d22 -+1: -+ cmp r6, #-21 -+ blt 1f -+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! -+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! -+ beq 11b -+ vmov s4, s5 -+ sub r6, #3 -+ vmov s0, s1 -+1: -+ cmp r6, #-22 -+ blt 1f -+ vst2.16 {d0[1], d2[1]}, [r0]! -+ vst2.16 {d0[0], d2[0]}, [r2]! -+ b 11b -+1: -+ vst1.16 {d0[1]}, [r0]! -+ vst1.16 {d0[0]}, [r2]! -+ b 11b -+ -+endfunc -+ -+@ void ff_rpi_sand30_lines_to_planar_p010( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_p010, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ vmov.u16 q15, #0xffc0 -+ sub r3, #1 -+ lsl r3, #7 -+ sub r1, r1, r6, lsl #1 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2!, {q10-q13} -+ add lr, #64 -+ -+ vshl.u32 q14, q10, #6 -+ ands lr, #127 -+ vshrn.u32 d4, q10, #14 -+ vshrn.u32 d2, q10, #4 -+ vmovn.u32 d0, q14 -+ -+ vshl.u32 q14, q11, #6 -+ it eq -+ addeq r2, r3 -+ vshrn.u32 d5, q11, #14 -+ vshrn.u32 d3, q11, #4 -+ vmovn.u32 d1, q14 -+ -+ subs r5, #48 -+ vand q2, q15 -+ vand q1, q15 -+ vand q0, q15 -+ -+ vshl.u32 q14, q12, #6 -+ vshrn.u32 d20, q12, #14 -+ vshrn.u32 d18, q12, #4 -+ vmovn.u32 d16, q14 -+ -+ vshl.u32 q14, q13, #6 -+ vshrn.u32 d21, q13, #14 -+ vshrn.u32 d19, q13, #4 -+ vmovn.u32 d17, q14 -+ -+ vand q10, q15 -+ vand q9, q15 -+ vand q8, q15 -+ blt 2f -+ -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4], r12 -+ vst3.16 {d16, d18, d20}, [r0], r12 -+ vst3.16 {d17, d19, d21}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #24-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4] -+ beq 11b -+ vmov q0, q8 -+ sub r5, #24 -+ vmov q1, q9 -+ vmov q2, q10 -+1: -+ cmp r5, #12-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0]! -+ beq 11b -+ vmov d0, d1 -+ sub r5, #12 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r5, #6-48 -+ add r4, r0, #6 @ avoid [r0]! on sequential instructions -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0] -+ vst3.16 {d0[1], d2[1], d4[1]}, [r4] -+ add r0, #12 -+ beq 11b -+ vmov s0, s1 -+ sub r5, #6 -+ vmov s4, s5 -+ vmov s8, s9 -+1: -+ cmp r5, #3-48 -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
-+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #2-48 -+ blt 1f -+ vst2.16 {d0[0], d2[0]}, [r0]! -+ b 11b -+1: -+ vst1.16 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ -+ -+ -diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h -new file mode 100644 -index 0000000000..447f367bea ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.h -@@ -0,0 +1,99 @@ -+/* -+Copyright (c) 2020 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+#ifndef AVUTIL_ARM_SAND_NEON_H -+#define AVUTIL_ARM_SAND_NEON_H -+ -+void ff_rpi_sand128b_stripe_to_8_10( -+ uint8_t * dest, // [r0] -+ const uint8_t * src1, // [r1] -+ const uint8_t * src2, // [r2] -+ unsigned int lines); // [r3] -+ -+void ff_rpi_sand8_lines_to_planar_y8( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+void ff_rpi_sand8_lines_to_planar_c8( -+ uint8_t * dst_u, // [r0] -+ unsigned int dst_stride_u, // [r1] -+ uint8_t * dst_v, // [r2] -+ unsigned int dst_stride_v, // [r3] -+ const uint8_t * src, // [sp, #0] -> r4, r5 -+ unsigned int stride1, // [sp, #4] 128 -+ unsigned int stride2, // [sp, #8] -> r8 -+ unsigned int _x, // [sp, #12] 0 -+ unsigned int y, // [sp, #16] (r7 in prefix) -+ unsigned int _w, // [sp, #20] -> r12, r6 -+ unsigned int h); // [sp, #24] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_y16( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_c16( -+ uint8_t * dst_u, // [r0] -+ unsigned int dst_stride_u, // [r1] -+ uint8_t * dst_v, // [r2] -+ unsigned int dst_stride_v, // [r3] -+ const uint8_t * src, // [sp, #0] -> r4, r5 -+ unsigned int stride1, // [sp, #4] 128 -+ unsigned int stride2, // [sp, #8] -> r8 -+ unsigned int _x, // [sp, #12] 0 -+ unsigned int y, // [sp, #16] (r7 in prefix) -+ unsigned int _w, // [sp, #20] 
-> r6, r9 -+ unsigned int h); // [sp, #24] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_p010( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+#endif // AVUTIL_ARM_SAND_NEON_H -+ -diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index 62a2ae08d9..cb73521ea7 100644 ---- a/libavutil/pixdesc.c -+++ b/libavutil/pixdesc.c -@@ -2717,6 +2717,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { - .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_FLOAT | - AV_PIX_FMT_FLAG_ALPHA, - }, -+ [AV_PIX_FMT_SAND128] = { -+ .name = "sand128", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 1, 0, 0, 8 }, /* Y */ -+ { 1, 2, 0, 0, 8 }, /* U */ -+ { 1, 2, 1, 0, 8 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_10] = { -+ .name = "sand64_10", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 10 }, /* Y */ -+ { 1, 4, 0, 0, 10 }, /* U */ -+ { 1, 4, 2, 0, 10 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_16] = { -+ .name = "sand64_16", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 16 }, /* Y */ -+ { 1, 4, 0, 0, 16 }, /* U */ -+ { 1, 4, 2, 0, 16 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_RPI4_8] = { -+ .name = "rpi4_8", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, -+ [AV_PIX_FMT_RPI4_10] = { -+ .name = "rpi4_10", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, +@@ -35,6 +37,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) + # libavfilter tests + AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o + AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o 
++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o + AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o + AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o + AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o +@@ -52,8 +55,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) + # libavutil tests + AVUTILOBJS += fixed_dsp.o + AVUTILOBJS += float_dsp.o ++AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o + +-CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) ++CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) + + CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o + CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o +diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c +index 8338e8ff58..c1ee09c72e 100644 +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -131,6 +131,9 @@ static const struct { + #if CONFIG_HUFFYUV_DECODER + { "huffyuvdsp", checkasm_check_huffyuvdsp }, + #endif ++ #if CONFIG_IDCTDSP ++ { "idctdsp", checkasm_check_idctdsp }, ++ #endif + #if CONFIG_JPEG2000_DECODER + { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, + #endif +@@ -155,6 +158,9 @@ static const struct { + #if CONFIG_V210_ENCODER + { "v210enc", checkasm_check_v210enc }, + #endif ++ #if CONFIG_VC1DSP ++ { "vc1dsp", checkasm_check_vc1dsp }, ++ #endif + #if CONFIG_VP8DSP + { "vp8dsp", checkasm_check_vp8dsp }, + #endif +@@ -172,6 +178,9 @@ static const struct { + #if CONFIG_BLEND_FILTER + { "vf_blend", checkasm_check_blend }, + #endif ++ #if CONFIG_BWDIF_FILTER ++ { "vf_bwdif", checkasm_check_vf_bwdif }, ++ #endif + #if CONFIG_COLORSPACE_FILTER + { "vf_colorspace", checkasm_check_colorspace }, + #endif +@@ -198,6 +207,9 @@ static const struct { + #if CONFIG_AVUTIL + { "fixed_dsp", checkasm_check_fixed_dsp }, + { "float_dsp", checkasm_check_float_dsp }, ++ #if CONFIG_SAND ++ { "rpi_sand", checkasm_check_rpi_sand }, ++ #endif + #endif + { NULL } }; - - static const char * const color_range_names[] = { -diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h 
-index 37c2c79e01..22f70007c3 100644 ---- a/libavutil/pixfmt.h -+++ b/libavutil/pixfmt.h -@@ -377,6 +377,12 @@ enum AVPixelFormat { - - AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian - AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian -+// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_RPI4_8, -+ AV_PIX_FMT_RPI4_10, - - AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined - AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined -diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h +index ef6645e3a2..02d3642836 100644 +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -70,12 +70,14 @@ void checkasm_check_hevc_epel_bi(void); + void checkasm_check_hevc_epel_bi_w(void); + void checkasm_check_hevc_sao(void); + void checkasm_check_huffyuvdsp(void); ++void checkasm_check_idctdsp(void); + void checkasm_check_jpeg2000dsp(void); + void checkasm_check_llviddsp(void); + void checkasm_check_llviddspenc(void); + void checkasm_check_nlmeans(void); + void checkasm_check_opusdsp(void); + void checkasm_check_pixblockdsp(void); ++void checkasm_check_rpi_sand(void); + void checkasm_check_sbrdsp(void); + void checkasm_check_synth_filter(void); + void checkasm_check_sw_rgb(void); +@@ -83,6 +85,8 @@ void checkasm_check_sw_scale(void); + void checkasm_check_utvideodsp(void); + void checkasm_check_v210dec(void); + void 
checkasm_check_v210enc(void); ++void checkasm_check_vc1dsp(void); ++void checkasm_check_vf_bwdif(void); + void checkasm_check_vf_eq(void); + void checkasm_check_vf_gblur(void); + void checkasm_check_vf_hflip(void); +diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c new file mode 100644 -index 0000000000..0324f6826d +index 0000000000..02724536a7 --- /dev/null -+++ b/libavutil/rpi_sand_fn_pw.h -@@ -0,0 +1,227 @@ ++++ b/tests/checkasm/idctdsp.c +@@ -0,0 +1,98 @@ +/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+// * Included twice from rpi_sand_fn with different PW -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x; -+ const unsigned int w = _w; -+ const unsigned int mask = stride1 - 1; -+ -+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) -+ if (_x == 0) { -+ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { -+ memcpy(dst, p, w); -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = 
(x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const uint8_t * p = p2; -+ uint8_t * d = dst; -+ memcpy(d, p1, w1); -+ d += w1; -+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { -+ memcpy(d, p, stride1); -+ } -+ memcpy(d, p, w3); -+ } -+ } -+} -+ -+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) -+ -+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ -+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) -+ if (_x == 0) { -+ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ const pixel * p = (const pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u 
+= dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * p = (const pixel *)p1; -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+} -+ -+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ 
const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+} -+ -+ -+#undef pixel -+#undef STRCAT -+#undef FUNC -+ -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -new file mode 100644 -index 0000000000..ed0261b02f ---- /dev/null -+++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,353 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#include "config.h" -+#include -+#include -+#include "rpi_sand_fns.h" -+#include "avassert.h" -+#include "frame.h" -+ -+#if ARCH_ARM && HAVE_NEON -+#include "arm/rpi_sand_neon.h" -+#define HAVE_SAND_ASM 1 -+#else -+#define HAVE_SAND_ASM 0 -+#endif -+ -+#define PW 1 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#define PW 2 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#if 1 -+// Simple round -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ const unsigned int rnd = (1 << shr) >> 1; -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ *dst++ = (*src++ + rnd) >> shr; -+ } -+} -+#else -+// Dithered variation -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ unsigned int rnd = (1 << shr) >> 1; -+ const unsigned int mask = ((1 << shr) - 1); -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ rnd = *src++ + (rnd & mask); -+ *dst++ = rnd >> shr; -+ } -+} -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+// _x & _w in pixels, strides in bytes -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = 
(_x / 3) * 4; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 2) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 4; -+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+#if HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * d = (uint16_t *)dst; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3 = *p++; -+ -+ if (xskip0 == 1) -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3 = *p++; -+ *d++ = p3 & 0x3ff; -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3 = *p; -+ -+ *d++ = p3 & 0x3ff; -+ if (xrem1 == 2) -+ *d++ = (p3 >> 10) & 0x3ff; -+ } -+ } -+} -+ -+ -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 3) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 8; -+ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * 
stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+#if HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * du = (uint16_t *)dst_u; -+ uint16_t * dv = (uint16_t *)dst_v; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ if (xskip0 == 1) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = (p3b >> 0) & 0x3ff; -+ } -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ if (xrem1 == 2) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ } -+ } -+ } -+} -+ -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr) -+{ -+ const unsigned int n = dst_stride1 / 2; -+ unsigned int j; -+ -+ // This is true for our current layouts -+ av_assert0(dst_stride1 == src_stride1); -+ -+ // As we 
have the same stride1 for src & dest and src is wider than dest -+ // then if we loop on src we can always write contiguously to dest -+ // We make no effort to copy an exact width - round up to nearest src stripe -+ // as we will always have storage in dest for that -+ -+#if ARCH_ARM && HAVE_NEON -+ if (shr == 3 && src_stride1 == 128) { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); -+ } -+ } -+ else -+#endif -+ { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ cpy16_to_8(d + n, s2, n, shr); -+ } -+ } -+ } -+ -+ // Fix up a trailing dest half stripe -+ if (j < w) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ } -+ } -+} -+ -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) -+{ -+ const int w = av_frame_cropped_width(src); -+ const int h = av_frame_cropped_height(src); -+ const int x = src->crop_left; -+ const int y = src->crop_top; -+ -+ // We will crop as part of the conversion -+ dst->crop_top = 0; -+ dst->crop_left = 0; -+ dst->crop_bottom = 0; -+ dst->crop_right = 0; -+ -+ switch (src->format){ -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], 
dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x*2, y, w*2, h); -+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_RPI4_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ default: -+ return -1; -+ } -+ -+ return av_frame_copy_props(dst, src); -+} -diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h -new file mode 100644 -index 0000000000..634b55e800 ---- /dev/null -+++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,183 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+#ifndef AVUTIL_RPI_SAND_FNS -+#define AVUTIL_RPI_SAND_FNS -+ -+#include "libavutil/frame.h" -+ -+// For all these fns _x & _w are measured as coord * PW -+// For the C fns coords are in chroma pels (so luma / 2) -+// Strides are in bytes -+ -+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_planar_to_sand_c8(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_planar_to_sand_c16(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, 
unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr); -+ -+ -+// dst must contain required pixel format & allocated data buffers -+// Cropping on the src buffer will be honoured and dst crop will be set to zero -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); -+ -+ -+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) -+{ -+#ifdef RPI_ZC_SAND128_ONLY -+ // If we are sure we only only support 128 byte sand formats replace the -+ // var with a constant which should allow for better optimisation -+ return 128; -+#else -+ return frame->linesize[0]; -+#endif -+} -+ -+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); -+} -+ -+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) -+{ -+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); -+} -+ -+static inline 
int av_rpi_is_sand30_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand8_frame(frame) ? 0 : 1; -+} -+ -+// If x is measured in bytes (not pixels) then this works for sand64_16 as -+// well as sand128 - but in the general case we work that out -+ -+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); -+} -+ -+#endif -+ - -From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 11:36:47 +0100 -Subject: [PATCH 003/136] Add aarch64 asm sand conv functions - -Many thanks to eiler.mike@gmail.com (Michael Eiler) for these 
-optimizations ---- - libavutil/aarch64/Makefile | 2 + - libavutil/aarch64/rpi_sand_neon.S | 676 ++++++++++++++++++++++++++++++ - libavutil/aarch64/rpi_sand_neon.h | 55 +++ - libavutil/rpi_sand_fn_pw.h | 4 +- - libavutil/rpi_sand_fns.c | 3 + - 5 files changed, 738 insertions(+), 2 deletions(-) - create mode 100644 libavutil/aarch64/rpi_sand_neon.S - create mode 100644 libavutil/aarch64/rpi_sand_neon.h - -diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile -index eba0151337..1b44beab39 100644 ---- a/libavutil/aarch64/Makefile -+++ b/libavutil/aarch64/Makefile -@@ -4,3 +4,5 @@ OBJS += aarch64/cpu.o \ - - NEON-OBJS += aarch64/float_dsp_neon.o \ - aarch64/tx_float_neon.o \ -+ aarch64/rpi_sand_neon.o \ -+ -diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S -new file mode 100644 -index 0000000000..cdcf71ee67 ---- /dev/null -+++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,676 @@ -+/* -+Copyright (c) 2021 Michael Eiler -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: Michael Eiler -+*/ -+ -+#include "asm.S" -+ -+// void ff_rpi_sand8_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+ -+function ff_rpi_sand8_lines_to_planar_y8, export=1 -+ // w15 contains the number of rows we need to process -+ ldr w15, [sp, #0] -+ -+ // w8 will contain the number of blocks per row -+ // w8 = floor(_w/stride1) -+ // stride1 is assumed to always be 128 -+ mov w8, w1 -+ lsr w8, w8, #7 -+ -+ // in case the width of the image is not a multiple of 128, there will -+ // be an incomplete block at the end of every row -+ // w9 contains the number of pixels stored within this block -+ // w9 = _w - w8 * 128 -+ lsl w9, w8, #7 -+ sub w9, w7, w9 -+ -+ // this is the value we have to add to the src pointer after reading a complete block -+ // it will move the address to the start of the next block -+ // w10 = stride2 * stride1 - stride1 -+ mov w10, w4 -+ lsl w10, w10, #7 -+ sub w10, w10, #128 -+ -+ // w11 is the row offset, meaning the start offset of the first block of every collumn -+ // this will be increased with stride1 within every iteration of the row_loop -+ eor w11, w11, w11 -+ -+ // w12 = 0, processed row count -+ eor w12, w12, w12 -+row_loop: -+ // start of the 
first block within the current row -+ // x13 = row offset + src -+ mov x13, x2 -+ add x13, x13, x11 -+ -+ // w14 = 0, processed block count -+ eor w14, w14, w14 -+ -+ cmp w8, #0 -+ beq no_main_y8 -+ -+block_loop: -+ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 -+ // fortunately these aren't callee saved ones, meaning we don't need to backup them -+ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 -+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 -+ -+ // write these registers back to the destination vector and increase the dst address by 128 -+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 -+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 -+ -+ // move the source register to the beginning of the next block (x13 = src + block offset) -+ add x13, x13, x10 -+ // increase the block counter -+ add w14, w14, #1 -+ -+ // continue with the block_loop if we haven't copied all full blocks yet -+ cmp w8, w14 -+ bgt block_loop -+ -+ // handle the last block at the end of each row -+ // at most 127 byte values copied from src to dst -+no_main_y8: -+ eor w5, w5, w5 // i = 0 -+incomplete_block_loop_y8: -+ cmp w5, w9 -+ bge incomplete_block_loop_end_y8 -+ -+ ldrb w6, [x13] -+ strb w6, [x0] -+ add x13, x13, #1 -+ add x0, x0, #1 -+ -+ add w5, w5, #1 -+ b incomplete_block_loop_y8 -+incomplete_block_loop_end_y8: -+ -+ -+ // increase the row offset by 128 (stride1) -+ add w11, w11, #128 -+ // increment the row counter -+ add w12, w12, #1 -+ -+ // process the next row if we haven't finished yet -+ cmp w15, w12 -+ bgt row_loop -+ -+ ret -+endfunc -+ -+ -+ -+// void ff_rpi_sand8_lines_to_planar_c8( -+// uint8_t * dst_u, : x0 -+// unsigned int dst_stride_u, : w1 == width -+// uint8_t * dst_v, : x2 -+// unsigned int dst_stride_v, : w3 == width -+// const uint8_t * src, : x4 -+// unsigned int stride1, : w5 == 128 -+// unsigned int stride2, : w6 -+// unsigned int _x, : w7 -+// unsigned int y, : [sp, #0] -+// unsigned int _w, : 
[sp, #8] -+// unsigned int h); : [sp, #16] -+ -+function ff_rpi_sand8_lines_to_planar_c8, export=1 -+ // w7 = width -+ ldr w7, [sp, #8] -+ -+ // w15 contains the number of rows we need to process -+ // counts down -+ ldr w15, [sp, #16] -+ -+ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 -+ mov w8, w7 -+ lsr w8, w8, #6 -+ -+ // number of pixels in block at the end of every row -+ // w9 = _w - (w8 * 64) -+ lsl w9, w8, #6 -+ sub w9, w7, w9 -+ -+ // Skip at the end of the line to account for stride -+ sub w12, w1, w7 -+ -+ // address delta to the beginning of the next block -+ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 -+ lsl w10, w6, #7 -+ sub w10, w10, #128 -+ -+ // w11 = row address start offset = 0 -+ eor w11, w11, w11 -+ -+row_loop_c8: -+ // start of the first block within the current row -+ // x13 = row offset + src -+ mov x13, x4 -+ add x13, x13, x11 -+ -+ // w14 = 0, processed block count -+ eor w14, w14, w14 -+ -+ cmp w8, #0 -+ beq no_main_c8 -+ -+block_loop_c8: -+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values -+ ld2 { v0.16b, v1.16b }, [x13], #32 -+ ld2 { v2.16b, v3.16b }, [x13], #32 -+ ld2 { v4.16b, v5.16b }, [x13], #32 -+ ld2 { v6.16b, v7.16b }, [x13], #32 -+ -+ // swap register so that we can write them out with a single instruction -+ mov v16.16b, v1.16b -+ mov v17.16b, v3.16b -+ mov v18.16b, v5.16b -+ mov v1.16b, v2.16b -+ mov v2.16b, v4.16b -+ mov v3.16b, v6.16b -+ mov v4.16b, v16.16b -+ mov v5.16b, v17.16b -+ mov v6.16b, v18.16b -+ -+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 -+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 -+ -+ // increment row counter and move src to the beginning of the next block -+ add w14, w14, #1 -+ add x13, x13, x10 -+ -+ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks -+ cmp w8, w14 -+ bgt block_loop_c8 -+ -+no_main_c8: -+ // handle incomplete block at the end of every row -+ eor w5, w5, w5 
// point counter, this might be -+incomplete_block_loop_c8: -+ cmp w5, w9 -+ bge incomplete_block_loop_end_c8 -+ -+ ldrb w1, [x13] -+ strb w1, [x0] -+ add x13, x13, #1 -+ -+ ldrb w1, [x13] -+ strb w1, [x2] -+ add x13, x13, #1 -+ -+ add x0, x0, #1 -+ add x2, x2, #1 -+ -+ add w5, w5, #1 -+ b incomplete_block_loop_c8 -+incomplete_block_loop_end_c8: -+ -+ // increase row_offset by stride1 -+ add w11, w11, #128 -+ add x0, x0, w12, sxtw -+ add x2, x2, w12, sxtw -+ -+ // jump to row_Loop_c8 iff the row count is small than the height -+ subs w15, w15, #1 -+ bgt row_loop_c8 -+ -+ ret -+endfunc -+ -+//void ff_rpi_sand30_lines_to_planar_y16( -+// uint8_t * dest, // [x0] -+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w -+// const uint8_t * src, // [x2] -+// unsigned int src_stride1, // [w3] -> 128 -+// unsigned int src_stride2, // [w4] -+// unsigned int _x, // [w5] -+// unsigned int y, // [w6] -+// unsigned int _w, // [w7] -+// unsigned int h); // [sp, #0] -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ // w6 = argument h -+ ldr w6, [sp, #48] -+ -+ // slice_inc = ((stride2 - 1) * stride1) -+ mov w5, w4 -+ sub w5, w5, #1 -+ lsl w5, w5, #7 -+ -+ // total number of bytes per row = (width / 3) * 4 -+ mov w8, w7 -+ mov w9, #3 -+ udiv w8, w8, w9 -+ lsl w8, w8, #2 -+ -+ // number of full 128 byte blocks to be processed -+ mov w9, #96 -+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 -+ -+ // w10 = number of full integers to process (4 bytes) -+ // w11 = remaning zero to two 10bit values still to copy over -+ mov w12, #96 -+ mul w12, w9, w12 -+ sub w12, w7, w12 // width - blocks*96 = remaining points per row -+ mov w11, #3 -+ udiv w10, w12, w11 // full integers to process = w12 / 3 -+ mul w11, w10, w11 // #integers *3 -+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 -+ -+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one -+ // this is to efficiently copy incomplete blocks at the end of the rows -+ // the last row is handled explicitly to avoid writing out of bounds -+ add w22, w10, w11 -+ cmp w22, #0 -+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise -+ add w9, w9, w22 -+ sub w6, w6, #1 -+ -+ // store the number of bytes in w20 which we copy too much for every row -+ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) -+ mov w20, #96*2 -+ mul w20, w20, w9 -+ sub w20, w1, w20 -+ -+ mov w23, #0 // flag to check whether the last line had already been processed -+ -+ // bitmask to clear the uppper 6bits of the result values -+ mov x19, #0x03ff03ff03ff03ff -+ dup v22.2d, x19 -+ -+ // row counter = 0 -+ eor w12, w12, w12 -+row_loop_y16: -+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows -+ bge row_loop_y16_fin -+ -+ mov x13, x2 // row src -+ eor w14, w14, w14 // full block counter -+block_loop_y16: -+ cmp w14, w9 -+ bge block_loop_y16_fin -+ -+ // load 64 bytes -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s 
}, [x13], #64 -+ -+ // process v0 and v1 -+ xtn v16.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v17.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v18.4h, v0.4s -+ -+ xtn2 v16.8h, v1.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v17.8h, v1.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v18.8h, v1.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // process v2 and v3 -+ xtn v23.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v24.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v25.4h, v2.4s -+ -+ xtn2 v23.8h, v3.4s -+ and v23.16b, v23.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v24.8h, v3.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v25.8h, v3.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ // load the second half of the block -> 64 bytes into registers v4-v7 -+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 -+ -+ // process v4 and v5 -+ xtn v16.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v17.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v18.4h, v4.4s -+ -+ xtn2 v16.8h, v5.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v17.8h, v5.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v18.8h, v5.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // v6 and v7 -+ xtn v23.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v24.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v25.4h, v6.4s -+ -+ xtn2 v23.8h, v7.4s -+ and v23.16b, v23.16b, v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v24.8h, v7.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v25.8h, v7.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ add x13, x13, x5 // row src += slice_inc -+ add w14, w14, #1 -+ b block_loop_y16 -+block_loop_y16_fin: -+ -+ -+ -+ -+ add x2, x2, #128 // src += stride1 (start of the next row) -+ add x0, x0, w20, sxtw // subtract the 
bytes we copied too much from dst -+ add w12, w12, #1 -+ b row_loop_y16 -+row_loop_y16_fin: -+ -+ // check whether we have incomplete blocks at the end of every row -+ // in that case decrease row block count by one -+ // change height back to it's original value (meaning increase it by 1) -+ // and jump back to another iteration of row_loop_y16 -+ -+ cmp w23, #1 -+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row -+ add w6, w6, #1 // increase height to the original value -+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count -+ mov w23, #1 -+ b row_loop_y16 -+row_loop_y16_fin2: -+ -+ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference -+ -+ // now we've got to handle the last block in the last row -+ eor w12, w12, w12 // w12 = 0 = counter -+integer_loop_y16: -+ cmp w12, w10 -+ bge integer_loop_y16_fin -+ ldr w14, [x13], #4 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ add w12, w12, #1 -+ b integer_loop_y16 -+integer_loop_y16_fin: -+ -+final_values_y16: -+ // remaining point count = w11 -+ ldr w14, [x13], #4 -+ cmp w11, #0 -+ beq final_values_y16_fin -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ cmp w11, #1 -+ beq final_values_y16_fin -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+final_values_y16_fin: -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ -+//void ff_rpi_sand30_lines_to_planar_c16( -+// uint8_t * dst_u, // [x0] -+// unsigned int dst_stride_u, // [w1] == _w*2 -+// uint8_t * dst_v, // [x2] -+// unsigned int dst_stride_v, // [w3] == _w*2 -+// const uint8_t * src, // [x4] -+// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] -+// unsigned int _x, // [w7] == 0 -+// unsigned int y, // [sp, #0] 
== 0 -+// unsigned int _w, // [sp, #8] -> w3 -+// unsigned int h); // [sp, #16] -> w7 -+ -+.macro rpi_sand30_lines_to_planar_c16_block_half -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 -+ -+ xtn v4.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v5.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v6.4h, v0.4s -+ xtn2 v4.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v5.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v6.8h, v1.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ -+ xtn v4.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v5.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v6.4h, v2.4s -+ xtn2 v4.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v5.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v6.8h, v3.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp] -+ sub sp, sp, #48 -+.endm -+ -+function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ ldr w3, [sp, #48+8] // w3 = width -+ ldr w7, [sp, #48+16] // w7 = height -+ -+ // reserve space on the stack for intermediate results -+ sub sp, sp, #256 -+ -+ // number of 128byte blocks per row, w8 = width / 48 -+ mov w9, #48 -+ udiv w8, w3, w9 -+ -+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 -+ mul w9, w8, w9 -+ sub w9, w3, w9 -+ -+ // row offset, the beginning of the next row to process -+ eor w10, w10, w10 -+ -+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128 -+ lsl w11, w6, #7 -+ sub w11, w11, #128 -+ -+ // decrease the height by one and in case of remaining pixels increase the block count by one -+ sub w7, w7, #1 -+ cmp w9, #0 -+ cset w19, ne // w19 == 1 iff reamining pixels != 0 -+ add w8, w8, w19 -+ -+ // bytes we have to move dst back by at the end of every row -+ mov w21, #48*2 -+ mul w21, w21, w8 -+ sub w21, w1, w21 -+ -+ mov w20, #0 // w20 = flag, last row processed -+ -+ mov x12, #0x03ff03ff03ff03ff -+ dup v16.2d, x12 -+ -+ // iterate through rows, row counter = w12 = 0 -+ eor w12, w12, w12 -+row_loop_c16: -+ cmp w12, w7 -+ bge row_loop_c16_fin -+ -+ // address of row data = src + row_offset -+ mov x13, x4 -+ add x13, x13, x10 -+ -+ eor w14, w14, w14 -+block_loop_c16: -+ cmp w14, w8 -+ bge block_loop_c16_fin -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ 
st1 { v5.8h }, [x2], #16 -+ -+ add x13, x13, x11 // offset to next block -+ add w14, w14, #1 -+ b block_loop_c16 -+block_loop_c16_fin: -+ -+ add w10, w10, #128 -+ add w12, w12, #1 -+ add x0, x0, w21, sxtw // move dst pointers back by x21 -+ add x2, x2, w21, sxtw -+ b row_loop_c16 -+row_loop_c16_fin: -+ -+ cmp w20, #1 -+ beq row_loop_c16_fin2 -+ mov w20, #1 -+ sub w8, w8, w19 // decrease block count by w19 -+ add w7, w7, #1 // increase height -+ b row_loop_c16 -+ -+row_loop_c16_fin2: -+ sub x0, x0, w21, sxtw // readd x21 in case of the last row -+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels -+ -+ // last incomplete block to be finished -+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp], #32 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #160 -+ -+ mov x4, sp -+ eor w20, w20, w20 -+rem_pix_c16_loop: -+ cmp w20, w9 -+ bge rem_pix_c16_fin -+ -+ ldr w22, [x4], #4 -+ str w22, [x0], #2 -+ lsr w22, w22, #16 -+ str w22, [x2], #2 -+ -+ add w20, w20, #1 -+ b rem_pix_c16_loop -+rem_pix_c16_fin: -+ -+ add sp, sp, #256 -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ -+ -+ -+//void ff_rpi_sand30_lines_to_planar_p010( -+// uint8_t * dest, -+// unsigned int dst_stride, -+// const uint8_t * src, -+// unsigned int src_stride1, -+// unsigned int src_stride2, -+// unsigned int _x, -+// unsigned int y, -+// unsigned int _w, -+// unsigned int h); -+ -diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h -new file mode 100644 -index 0000000000..b3aa481ea4 ---- /dev/null -+++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -0,0 +1,55 @@ -+/* -+Copyright (c) 2021 Michael Eiler -+ -+Redistribution 
and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: Michael Eiler -+*/ -+ -+#pragma once -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, -+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, -+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, -+ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+#ifdef __cplusplus -+} -+#endif -+ -diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h -index 0324f6826d..0d5d203dc3 100644 ---- a/libavutil/rpi_sand_fn_pw.h -+++ b/libavutil/rpi_sand_fn_pw.h -@@ -54,7 +54,7 @@ void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, - const unsigned int w = _w; - const unsigned int mask = stride1 - 1; - --#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) -+#if PW == 1 && HAVE_SAND_ASM - if (_x == 0) { - ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, - src, stride1, stride2, _x, y, _w, h); -@@ -106,7 +106,7 @@ void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_strid - const unsigned int w = _w * 2; - const unsigned int mask = stride1 - 1; - --#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) -+#if PW == 1 && HAVE_SAND_ASM - if (_x == 0) { - ff_rpi_sand8_lines_to_planar_c8(dst_u, 
dst_stride_u, dst_v, dst_stride_v, - src, stride1, stride2, _x, y, _w, h); -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -index ed0261b02f..1f543e9357 100644 ---- a/libavutil/rpi_sand_fns.c -+++ b/libavutil/rpi_sand_fns.c -@@ -37,6 +37,9 @@ Authors: John Cox - #if ARCH_ARM && HAVE_NEON - #include "arm/rpi_sand_neon.h" - #define HAVE_SAND_ASM 1 -+#elif ARCH_AARCH64 && HAVE_NEON -+#include "aarch64/rpi_sand_neon.h" -+#define HAVE_SAND_ASM 1 - #else - #define HAVE_SAND_ASM 0 - #endif - -From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 11:56:02 +0100 -Subject: [PATCH 004/136] Add raw encoding for sand - ---- - libavcodec/raw.c | 6 +++ - libavcodec/rawenc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++- - 2 files changed, 96 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index 1e5b48d1e0..1e689f9ee0 100644 ---- a/libavcodec/raw.c -+++ b/libavcodec/raw.c -@@ -295,6 +295,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = { - { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ - { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ - -+ /* RPI (Might as well define for everything) */ -+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, -+ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, -+ - { AV_PIX_FMT_NONE, 0 }, - }; - -diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index 8c577006d9..594a77c42a 100644 ---- a/libavcodec/rawenc.c -+++ b/libavcodec/rawenc.c -@@ -24,6 +24,7 @@ - * Raw Video Encoder - */ - -+#include "config.h" - #include "avcodec.h" - #include "codec_internal.h" - #include "encode.h" -@@ -33,6 +34,10 @@ - #include "libavutil/intreadwrite.h" - #include "libavutil/imgutils.h" - #include "libavutil/internal.h" -+#include "libavutil/avassert.h" -+#if CONFIG_SAND -+#include 
"libavutil/rpi_sand_fns.h" -+#endif - - static av_cold int raw_encode_init(AVCodecContext *avctx) - { -@@ -46,12 +51,95 @@ static av_cold int raw_encode_init(AVCodecContext *avctx) - return 0; - } - -+#if CONFIG_SAND -+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3 / 2; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height; -+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); -+ return 0; -+} -+ -+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); -+ dst += width * height * 2; -+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); -+ return 0; -+} -+ -+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = 
av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height * 2; -+ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); -+ return 0; -+} -+#endif -+ -+ - static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *frame, int *got_packet) - { -- int ret = av_image_get_buffer_size(frame->format, -- frame->width, frame->height, 1); -+ int ret; - -+#if CONFIG_SAND -+ if (av_rpi_is_sand_frame(frame)) { -+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : -+ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : -+ av_rpi_is_sand30_frame(frame) ? 
raw_sand30_as_yuv420(avctx, pkt, frame) : -1; -+ *got_packet = (ret == 0); -+ return ret; -+ } -+#endif -+ -+ ret = av_image_get_buffer_size(frame->format, -+ frame->width, frame->height, 1); - if (ret < 0) - return ret; - - -From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 12:02:09 +0100 -Subject: [PATCH 005/136] Deal with the lack of trivial sand cropping - ---- - fftools/ffmpeg.c | 4 ++-- - fftools/ffmpeg_filter.c | 4 ++-- - libavutil/frame.c | 11 +++++++++++ - libavutil/frame.h | 10 ++++++++++ - 4 files changed, 25 insertions(+), 4 deletions(-) - -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index d721a5e721..15e084f0b2 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -1993,8 +1993,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref - av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout); - break; - case AVMEDIA_TYPE_VIDEO: -- need_reinit |= ifilter->width != frame->width || -- ifilter->height != frame->height; -+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || -+ ifilter->height != av_frame_cropped_height(frame); - break; - } - -diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c -index 1f5bbf6c4d..f888307762 100644 ---- a/fftools/ffmpeg_filter.c -+++ b/fftools/ffmpeg_filter.c -@@ -1281,8 +1281,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) - - ifilter->format = frame->format; - -- ifilter->width = frame->width; -- ifilter->height = frame->height; -+ ifilter->width = av_frame_cropped_width(frame); -+ ifilter->height = av_frame_cropped_height(frame); - ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; - - ifilter->sample_rate = frame->sample_rate; -diff --git a/libavutil/frame.c b/libavutil/frame.c -index 9545477acc..48621e4098 100644 ---- a/libavutil/frame.c -+++ b/libavutil/frame.c -@@ -16,6 +16,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
02110-1301 USA - */ - -+#include "config.h" -+ - #include "channel_layout.h" - #include "avassert.h" - #include "buffer.h" -@@ -27,6 +29,9 @@ - #include "mem.h" - #include "samplefmt.h" - #include "hwcontext.h" -+#if CONFIG_SAND -+#include "rpi_sand_fns.h" -+#endif - - #if FF_API_OLD_CHANNEL_LAYOUT - #define CHECK_CHANNELS_CONSISTENCY(frame) \ -@@ -874,6 +879,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) - (frame->crop_top + frame->crop_bottom) >= frame->height) - return AVERROR(ERANGE); - -+#if CONFIG_SAND -+ // Sand cannot be cropped - do not try -+ if (av_rpi_is_sand_format(frame->format)) -+ return 0; -+#endif -+ - desc = av_pix_fmt_desc_get(frame->format); - if (!desc) - return AVERROR_BUG; -diff --git a/libavutil/frame.h b/libavutil/frame.h -index 2580269549..3a9d323325 100644 ---- a/libavutil/frame.h -+++ b/libavutil/frame.h -@@ -957,6 +957,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); - */ - const char *av_frame_side_data_name(enum AVFrameSideDataType type); - -+ -+static inline int av_frame_cropped_width(const AVFrame * const frame) -+{ -+ return frame->width - (frame->crop_left + frame->crop_right); -+} -+static inline int av_frame_cropped_height(const AVFrame * const frame) -+{ -+ return frame->height - (frame->crop_top + frame->crop_bottom); -+} -+ - /** - * @} - */ - -From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 12:31:16 +0100 -Subject: [PATCH 006/136] Add an unsand filter - ---- - configure | 1 + - libavfilter/Makefile | 1 + - libavfilter/allfilters.c | 1 + - libavfilter/buffersrc.c | 2 +- - libavfilter/vf_unsand.c | 228 +++++++++++++++++++++++++++++++++++++++ - 5 files changed, 232 insertions(+), 1 deletion(-) - create mode 100644 libavfilter/vf_unsand.c - -diff --git a/configure b/configure -index 27112ced58..7712482bd5 100755 ---- a/configure -+++ b/configure -@@ -3754,6 +3754,7 @@ tonemap_opencl_filter_deps="opencl const_nan" - 
transpose_opencl_filter_deps="opencl" - transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" - transpose_vulkan_filter_deps="vulkan spirv_compiler" -+unsand_filter_select="sand" - unsharp_opencl_filter_deps="opencl" - uspp_filter_deps="gpl avcodec" - vaguedenoiser_filter_deps="gpl" -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index b3d3d981dd..c14fc995a0 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -518,6 +518,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o - OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vf_transpose_vulkan.o vulkan.o vulkan_filter.o - OBJS-$(CONFIG_TRIM_FILTER) += trim.o - OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o -+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o - OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o - OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ - opencl/unsharp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index d7db46c2af..b990a00152 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -490,6 +490,7 @@ extern const AVFilter ff_vf_trim; - extern const AVFilter ff_vf_unpremultiply; - extern const AVFilter ff_vf_unsharp; - extern const AVFilter ff_vf_unsharp_opencl; -+extern const AVFilter ff_vf_unsand; - extern const AVFilter ff_vf_untile; - extern const AVFilter ff_vf_uspp; - extern const AVFilter ff_vf_v360; -diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c -index ba17450b93..0dbe5d2335 100644 ---- a/libavfilter/buffersrc.c -+++ b/libavfilter/buffersrc.c -@@ -201,7 +201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS - - switch (ctx->outputs[0]->type) { - case AVMEDIA_TYPE_VIDEO: -- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, -+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), - frame->format, frame->pts); - break; - case AVMEDIA_TYPE_AUDIO: -diff --git a/libavfilter/vf_unsand.c 
b/libavfilter/vf_unsand.c -new file mode 100644 -index 0000000000..7100f2fc9b ---- /dev/null -+++ b/libavfilter/vf_unsand.c -@@ -0,0 +1,228 @@ -+/* -+ * Copyright (c) 2007 Bobby Bingham ++ * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. + * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * format and noformat video filters ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include + -+#include "libavutil/internal.h" -+#include "libavutil/mem.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/opt.h" -+#include "libavutil/rpi_sand_fns.h" ++#include "checkasm.h" + -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" ++#include "libavcodec/idctdsp.h" + -+typedef struct UnsandContext { -+ const AVClass *class; -+} UnsandContext; -+ -+static av_cold void uninit(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+} -+ -+static av_cold int init(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ -+ return 0; -+} -+ -+ -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterLink * const outlink = link->dst->outputs[0]; -+ AVFrame *out = NULL; -+ int rv = 0; -+ -+ if (outlink->format == in->format) { -+ // If nothing to do then do nothing -+ out = in; -+ } -+ else -+ { -+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) -+ { -+ rv = AVERROR(ENOMEM); -+ goto fail; -+ } -+ if (av_rpi_sand_to_planar_frame(out, in) != 0) -+ { -+ rv = -1; -+ goto fail; -+ } -+ -+ av_frame_free(&in); -+ } -+ -+ return ff_filter_frame(outlink, out); -+ -+fail: -+ av_frame_free(&out); -+ av_frame_free(&in); -+ return rv; -+} -+ -+#if 0 -+static void dump_fmts(const AVFilterFormats * fmts) -+{ -+ int i; -+ if (fmts== NULL) { -+ printf("NULL\n"); -+ return; -+ } -+ for (i = 0; i < fmts->nb_formats; ++i) { -+ printf(" %d", fmts->formats[i]); -+ } -+ printf("\n"); -+} -+#endif -+ -+static int query_formats(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ int ret; -+ -+ // If we aren't connected at both ends then just do nothing -+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) -+ return 0; -+ -+ // Our output formats depend on our input formats and we can't/don't -+ // want to convert between bit depths so we need to wait for the source -+ // to have an opinion before we do -+ if 
(ctx->inputs[0]->incfg.formats == NULL) -+ return AVERROR(EAGAIN); -+ -+ // Accept anything -+ if (ctx->inputs[0]->outcfg.formats == NULL && -+ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) -+ return ret; -+ -+ // Filter out sand formats -+ -+ // Generate a container if we don't already have one -+ if (ctx->outputs[0]->incfg.formats == NULL) -+ { -+ // Somewhat rubbish way of ensuring we have a good structure -+ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; -+ AVFilterFormats *formats = ff_make_format_list(out_fmts); -+ -+ if (formats == NULL) -+ return AVERROR(ENOMEM); -+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) -+ return ret; -+ } -+ -+ // Replace old format list with new filtered list derived from what our -+ // input says it can do -+ { -+ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; -+ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; -+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); -+ int i; -+ int n = 0; -+ int seen_420p = 0; -+ int seen_420p10 = 0; -+ -+ for (i = 0; i < src_ff->nb_formats; ++i) { -+ const enum AVPixelFormat f = src_ff->formats[i]; -+ -+ switch (f){ -+ case AV_PIX_FMT_YUV420P: -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ if (!seen_420p) { -+ seen_420p = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ case AV_PIX_FMT_YUV420P10: -+ case AV_PIX_FMT_RPI4_10: -+ if (!seen_420p10) { -+ seen_420p10 = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; -+ } -+ break; -+ default: -+ dst_fmts[n++] = f; -+ break; -+ } -+ } -+ -+ av_freep(&dst_ff->formats); -+ dst_ff->formats = dst_fmts; -+ dst_ff->nb_formats = n; -+ } -+ -+// printf("Unsand: %s calc: ", __func__); -+// dump_fmts(ctx->outputs[0]->incfg.formats); -+ -+ return 0; -+} -+ -+ -+#define OFFSET(x) offsetof(UnsandContext, x) 
-+static const AVOption unsand_options[] = { -+ { NULL } -+}; -+ -+ -+AVFILTER_DEFINE_CLASS(unsand); -+ -+static const AVFilterPad avfilter_vf_unsand_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad avfilter_vf_unsand_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO -+ }, -+}; -+ -+AVFilter ff_vf_unsand = { -+ .name = "unsand", -+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), -+ -+ .init = init, -+ .uninit = uninit, -+ -+ FILTER_QUERY_FUNC(query_formats), -+ -+ .priv_size = sizeof(UnsandContext), -+ .priv_class = &unsand_class, -+ -+ FILTER_INPUTS(avfilter_vf_unsand_inputs), -+ FILTER_OUTPUTS(avfilter_vf_unsand_outputs), -+}; -+ - -From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 12:37:07 +0100 -Subject: [PATCH 007/136] Reduce mmal compile warnings - ---- - libavcodec/mmaldec.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 3092f58510..6f41b41ac4 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ - -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - #include - - #include "avcodec.h" - -From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 17:56:16 +0100 -Subject: [PATCH 008/136] Add chroma location to hevc parse - ---- - libavcodec/hevc_parser.c | 13 +++++++++++++ - libavcodec/hevcdec.c | 13 +++++++++++++ - 2 files changed, 26 insertions(+) - -diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c -index 59f9a0ff3e..4ae7222e8b 100644 ---- a/libavcodec/hevc_parser.c -+++ 
b/libavcodec/hevc_parser.c -@@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, - avctx->profile = ps->sps->ptl.general_ptl.profile_idc; - avctx->level = ps->sps->ptl.general_ptl.level_idc; - -+ if (ps->sps->chroma_format_idc == 1) { -+ avctx->chroma_sample_location = ps->sps->vui.common.chroma_loc_info_present_flag ? -+ ps->sps->vui.common.chroma_sample_loc_type_top_field + 1 : -+ AVCHROMA_LOC_LEFT; -+ } -+ else if (ps->sps->chroma_format_idc == 2 || -+ ps->sps->chroma_format_idc == 3) { -+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; -+ } -+ else { -+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; -+ } -+ - if (ps->vps->vps_timing_info_present_flag) { - num = ps->vps->vps_num_units_in_tick; - den = ps->vps->vps_time_scale; -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 567e8d81d4..b6cfea64d3 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -347,6 +347,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) - else - avctx->color_range = AVCOL_RANGE_MPEG; - -+ if (sps->chroma_format_idc == 1) { -+ avctx->chroma_sample_location = sps->vui.common.chroma_loc_info_present_flag ? 
-+ sps->vui.common.chroma_sample_loc_type_top_field + 1 : -+ AVCHROMA_LOC_LEFT; -+ } -+ else if (sps->chroma_format_idc == 2 || -+ sps->chroma_format_idc == 3) { -+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; -+ } -+ else { -+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; -+ } -+ - if (sps->vui.common.colour_description_present_flag) { - avctx->color_primaries = sps->vui.common.colour_primaries; - avctx->color_trc = sps->vui.common.transfer_characteristics; - -From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 26 Sep 2022 18:20:50 +0100 -Subject: [PATCH 009/136] hwaccel: Add .abort_frame & use in hevcdec - ---- - libavcodec/avcodec.h | 11 +++++++++++ - libavcodec/hevcdec.c | 7 ++++++- - 2 files changed, 17 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 39881a1d2b..32bc78e2be 100644 ---- a/libavcodec/avcodec.h -+++ b/libavcodec/avcodec.h -@@ -2221,6 +2221,17 @@ typedef struct AVHWAccel { - * that avctx->hwaccel_priv_data is invalid. 
- */ - int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); -+ -+ /** -+ * Called if parsing fails -+ * -+ * An error has occured, end_frame will not be called -+ * start_frame & decode_slice may or may not have been called -+ * Optional -+ * -+ * @param avctx the codec context -+ */ -+ void (*abort_frame)(AVCodecContext *avctx); - } AVHWAccel; - - /** -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index b6cfea64d3..8a0246fa21 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -3375,8 +3375,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe, - - s->ref = NULL; - ret = decode_nal_units(s, avpkt->data, avpkt->size); -- if (ret < 0) -+ if (ret < 0) { -+ // Ensure that hwaccel knows this frame is over -+ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) -+ s->avctx->hwaccel->abort_frame(s->avctx); -+ - return ret; -+ } - - if (avctx->hwaccel) { - if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { - -From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 26 Sep 2022 18:26:17 +0100 -Subject: [PATCH 010/136] hwaccel: Add CAP_MT_SAFE for accels that can use - multi-thread - ---- - libavcodec/hwconfig.h | 1 + - libavcodec/pthread_frame.c | 7 +++++-- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h -index 721424912c..c43ad55245 100644 ---- a/libavcodec/hwconfig.h -+++ b/libavcodec/hwconfig.h -@@ -24,6 +24,7 @@ - - - #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) -+#define HWACCEL_CAP_MT_SAFE (1 << 1) - - - typedef struct AVCodecHWConfigInternal { -diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c -index d9d5afaa82..2cc89a41f5 100644 ---- a/libavcodec/pthread_frame.c -+++ b/libavcodec/pthread_frame.c -@@ -204,7 +204,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) - - /* if the previous thread uses hwaccel then we take the lock to ensure - * the 
threads don't run concurrently */ -- if (avctx->hwaccel) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } -@@ -590,7 +591,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { - - if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - -- if (avctx->hwaccel && !p->hwaccel_serializing) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && -+ !p->hwaccel_serializing) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } - -From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 17:59:08 +0100 -Subject: [PATCH 011/136] Weak link utils - ---- - libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++ - libavcodec/weak_link.h | 23 ++++++++++ - 2 files changed, 125 insertions(+) - create mode 100644 libavcodec/weak_link.c - create mode 100644 libavcodec/weak_link.h - -diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c -new file mode 100644 -index 0000000000..f234a985b9 ---- /dev/null -+++ b/libavcodec/weak_link.c -@@ -0,0 +1,102 @@ -+#include -+#include -+#include -+#include "weak_link.h" -+ -+struct ff_weak_link_master { -+ atomic_int ref_count; /* 0 is single ref for easier atomics */ -+ pthread_rwlock_t lock; -+ void * ptr; -+}; -+ -+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) -+{ -+ return (struct ff_weak_link_master *)c; -+} -+ -+struct ff_weak_link_master * ff_weak_link_new(void * p) -+{ -+ struct ff_weak_link_master * w = malloc(sizeof(*w)); -+ if (!w) -+ return NULL; -+ w->ptr = p; -+ if (pthread_rwlock_init(&w->lock, NULL)) { -+ free(w); -+ return NULL; -+ } -+ return w; -+} -+ -+static void weak_link_do_unref(struct ff_weak_link_master * const w) -+{ -+ int n = atomic_fetch_sub(&w->ref_count, 1); -+ if (n) -+ return; -+ -+ 
pthread_rwlock_destroy(&w->lock); -+ free(w); -+} -+ -+// Unref & break link -+void ff_weak_link_break(struct ff_weak_link_master ** ppLink) -+{ -+ struct ff_weak_link_master * const w = *ppLink; -+ if (!w) -+ return; -+ -+ *ppLink = NULL; -+ pthread_rwlock_wrlock(&w->lock); -+ w->ptr = NULL; -+ pthread_rwlock_unlock(&w->lock); -+ -+ weak_link_do_unref(w); -+} -+ -+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) -+{ -+ if (!w) -+ return NULL; -+ atomic_fetch_add(&w->ref_count, 1); -+ return (struct ff_weak_link_client*)w; -+} -+ -+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(*ppLink); -+ if (!w) -+ return; -+ -+ *ppLink = NULL; -+ weak_link_do_unref(w); -+} -+ -+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(*ppLink); -+ -+ if (!w) -+ return NULL; -+ -+ if (pthread_rwlock_rdlock(&w->lock)) -+ goto broken; -+ -+ if (w->ptr) -+ return w->ptr; -+ -+ pthread_rwlock_unlock(&w->lock); -+ -+broken: -+ *ppLink = NULL; -+ weak_link_do_unref(w); -+ return NULL; -+} -+ -+// Ignores a NULL c (so can be on the return path of both broken & live links) -+void ff_weak_link_unlock(struct ff_weak_link_client * c) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(c); -+ if (w) -+ pthread_rwlock_unlock(&w->lock); -+} -+ -+ -diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h -new file mode 100644 -index 0000000000..415b6a27a0 ---- /dev/null -+++ b/libavcodec/weak_link.h -@@ -0,0 +1,23 @@ -+struct ff_weak_link_master; -+struct ff_weak_link_client; -+ -+struct ff_weak_link_master * ff_weak_link_new(void * p); -+void ff_weak_link_break(struct ff_weak_link_master ** ppLink); -+ -+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w); -+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink); -+ -+// Returns NULL if link broken - in this case it will also zap -+// 
*ppLink and unref the weak_link. -+// Returns NULL if *ppLink is NULL (so a link once broken stays broken) -+// -+// The above does mean that there is a race if this is called simultainiously -+// by two threads using the same weak_link_client (so don't do that) -+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); -+void ff_weak_link_unlock(struct ff_weak_link_client * c); -+ -+ -+ -+ -+ -+ - -From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 19:23:26 +0100 -Subject: [PATCH 012/136] Add v4l2_req V4L2 request H265 drm_prime decode - -Has the abiliy to switch between kernel API versions at runtime. This -could be removed later once teher is no chance of usage on an old -kernel. ---- - configure | 14 + - libavcodec/Makefile | 4 + - libavcodec/hevc-ctrls-v1.h | 229 +++++ - libavcodec/hevc-ctrls-v2.h | 257 +++++ - libavcodec/hevcdec.c | 10 + - libavcodec/hwaccels.h | 1 + - libavcodec/hwconfig.h | 2 + - libavcodec/v4l2_req_decode_q.c | 84 ++ - libavcodec/v4l2_req_decode_q.h | 25 + - libavcodec/v4l2_req_devscan.c | 449 +++++++++ - libavcodec/v4l2_req_devscan.h | 23 + - libavcodec/v4l2_req_dmabufs.c | 266 ++++++ - libavcodec/v4l2_req_dmabufs.h | 40 + - libavcodec/v4l2_req_hevc_v1.c | 3 + - libavcodec/v4l2_req_hevc_v2.c | 3 + - libavcodec/v4l2_req_hevc_vx.c | 1213 +++++++++++++++++++++++ - libavcodec/v4l2_req_media.c | 1596 +++++++++++++++++++++++++++++++ - libavcodec/v4l2_req_media.h | 151 +++ - libavcodec/v4l2_req_pollqueue.c | 361 +++++++ - libavcodec/v4l2_req_pollqueue.h | 18 + - libavcodec/v4l2_req_utils.h | 27 + - libavcodec/v4l2_request_hevc.c | 297 ++++++ - libavcodec/v4l2_request_hevc.h | 102 ++ - 23 files changed, 5175 insertions(+) - create mode 100644 libavcodec/hevc-ctrls-v1.h - create mode 100644 libavcodec/hevc-ctrls-v2.h - create mode 100644 libavcodec/v4l2_req_decode_q.c - create mode 100644 libavcodec/v4l2_req_decode_q.h - create mode 100644 libavcodec/v4l2_req_devscan.c - create 
mode 100644 libavcodec/v4l2_req_devscan.h - create mode 100644 libavcodec/v4l2_req_dmabufs.c - create mode 100644 libavcodec/v4l2_req_dmabufs.h - create mode 100644 libavcodec/v4l2_req_hevc_v1.c - create mode 100644 libavcodec/v4l2_req_hevc_v2.c - create mode 100644 libavcodec/v4l2_req_hevc_vx.c - create mode 100644 libavcodec/v4l2_req_media.c - create mode 100644 libavcodec/v4l2_req_media.h - create mode 100644 libavcodec/v4l2_req_pollqueue.c - create mode 100644 libavcodec/v4l2_req_pollqueue.h - create mode 100644 libavcodec/v4l2_req_utils.h - create mode 100644 libavcodec/v4l2_request_hevc.c - create mode 100644 libavcodec/v4l2_request_hevc.h - -diff --git a/configure b/configure -index 7712482bd5..199aa2b3d5 100755 ---- a/configure -+++ b/configure -@@ -281,6 +281,7 @@ External library support: - if openssl, gnutls or mbedtls is not used [no] - --enable-libtwolame enable MP2 encoding via libtwolame [no] - --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] -+ --enable-libudev enable libudev [no] - --enable-libv4l2 enable libv4l2/v4l-utils [no] - --enable-libvidstab enable video stabilization using vid.stab [no] - --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -351,6 +352,7 @@ External library support: - --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] - --enable-rkmpp enable Rockchip Media Process Platform code [no] - --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] -+ --enable-v4l2-request enable V4L2 request API code [no] - --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] - --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] - --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1858,6 +1860,7 @@ EXTERNAL_LIBRARY_LIST=" - libtheora - libtwolame - libuavs3d -+ libudev - libv4l2 - libvmaf - libvorbis -@@ -1914,6 +1917,7 @@ HWACCEL_LIBRARY_LIST=" - mmal - omx - opencl -+ v4l2_request - " - - DOCUMENT_LIST=" -@@ -3002,6 +3006,7 @@ 
d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" - dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" - ffnvcodec_deps_any="libdl LoadLibrary" - nvdec_deps="ffnvcodec" -+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" - vaapi_x11_deps="xlib_x11" - videotoolbox_hwaccel_deps="videotoolbox pthreads" - videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -3045,6 +3050,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" - hevc_dxva2_hwaccel_select="hevc_decoder" - hevc_nvdec_hwaccel_deps="nvdec" - hevc_nvdec_hwaccel_select="hevc_decoder" -+hevc_v4l2request_hwaccel_deps="v4l2_request" -+hevc_v4l2request_hwaccel_select="hevc_decoder" - hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" - hevc_vaapi_hwaccel_select="hevc_decoder" - hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -6696,6 +6703,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } - enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode -+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init -@@ -6798,6 +6806,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r - { enabled libdrm || - die "ERROR: rkmpp requires --enable-libdrm"; } - } -+enabled v4l2_request && { enabled libdrm || -+ die "ERROR: v4l2-request requires --enable-libdrm"; } && -+ { enabled libudev || -+ die "ERROR: v4l2-request requires --enable-libudev"; } - enabled vapoursynth && require_pkg_config vapoursynth 
"vapoursynth-script >= 42" VSScript.h vsscript_init - - -@@ -6880,6 +6892,8 @@ if enabled v4l2_m2m; then - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - fi - -+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 389253f5d0..2d440b5648 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -170,6 +170,8 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - OBJS-$(CONFIG_VP8DSP) += vp8dsp.o - OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o -+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ -+ v4l2_req_devscan.o weak_link.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o - OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - -@@ -996,6 +998,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o -+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h -new file mode 100644 -index 0000000000..72cbba0953 ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v1.h -@@ -0,0 +1,229 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. 
-+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. */ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) 
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. */ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define 
V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; 
-+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ __u8 num_active_dpb_entries; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 num_rps_poc_st_curr_before; -+ __u8 num_rps_poc_st_curr_after; -+ __u8 num_rps_poc_lt_curr; -+ -+ __u8 padding; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h -new file mode 100644 -index 0000000000..7cbbbf055f ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v2.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. 
*/ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 
padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 padding[5]; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u8 num_active_dpb_entries; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ -+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) -+/* -+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - -+ * the number of data (in bits) to skip in the -+ * slice segment header. -+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" -+ * to before syntax element "slice_temporal_mvp_enabled_flag". -+ * If IDR, the skipped bits are just "pic_output_flag" -+ * (separate_colour_plane_flag is not supported). 
-+ */ -+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 8a0246fa21..2867cb2e16 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -416,6 +416,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ - CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ - CONFIG_HEVC_NVDEC_HWACCEL + \ -+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ - CONFIG_HEVC_VAAPI_HWACCEL + \ - CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ - CONFIG_HEVC_VDPAU_HWACCEL) -@@ -442,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV420P10: -@@ -463,6 +467,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_NVDEC_HWACCEL - *fmt++ = AV_PIX_FMT_CUDA; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV444P: -@@ -3749,6 +3756,9 @@ const FFCodec ff_hevc_decoder = { - #if CONFIG_HEVC_NVDEC_HWACCEL - HWACCEL_NVDEC(hevc), - #endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(hevc), -+#endif - #if CONFIG_HEVC_VAAPI_HWACCEL - HWACCEL_VAAPI(hevc), - #endif -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index aca55831f3..f32d1c4ec4 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; - extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; 
- extern const AVHWAccel ff_hevc_dxva2_hwaccel; - extern const AVHWAccel ff_hevc_nvdec_hwaccel; -+extern const AVHWAccel ff_hevc_v4l2request_hwaccel; - extern const AVHWAccel ff_hevc_vaapi_hwaccel; - extern const AVHWAccel ff_hevc_vdpau_hwaccel; - extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; -diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h -index c43ad55245..b8aa383071 100644 ---- a/libavcodec/hwconfig.h -+++ b/libavcodec/hwconfig.h -@@ -71,6 +71,8 @@ typedef struct AVCodecHWConfigInternal { - HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) - #define HWACCEL_NVDEC(codec) \ - HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) -+#define HWACCEL_V4L2REQUEST(codec) \ -+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) - #define HWACCEL_VAAPI(codec) \ - HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) - #define HWACCEL_VDPAU(codec) \ -diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c -new file mode 100644 -index 0000000000..5b3fb958fa ---- /dev/null -+++ b/libavcodec/v4l2_req_decode_q.c -@@ -0,0 +1,84 @@ -+#include -+#include -+#include -+ -+#include "v4l2_req_decode_q.h" -+ -+int decode_q_in_q(const req_decode_ent * const d) -+{ -+ return d->in_q; -+} -+ -+void decode_q_add(req_decode_q * const q, req_decode_ent * const d) -+{ -+ pthread_mutex_lock(&q->q_lock); -+ if (!q->head) { -+ q->head = d; -+ q->tail = d; -+ d->prev = NULL; -+ } -+ else { -+ q->tail->next = d; -+ d->prev = q->tail; -+ q->tail = d; -+ } -+ d->next = NULL; -+ d->in_q = 1; -+ pthread_mutex_unlock(&q->q_lock); -+} -+ -+// Remove entry from Q - if head wake-up anything that was waiting -+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) -+{ -+ int try_signal = 0; -+ -+ if (!d->in_q) -+ return; -+ -+ pthread_mutex_lock(&q->q_lock); -+ if (d->prev) -+ d->prev->next = d->next; -+ else { -+ try_signal = 1; // Only need to signal if we 
were head -+ q->head = d->next; -+ } -+ -+ if (d->next) -+ d->next->prev = d->prev; -+ else -+ q->tail = d->prev; -+ -+ // Not strictly needed but makes debug easier -+ d->next = NULL; -+ d->prev = NULL; -+ d->in_q = 0; -+ pthread_mutex_unlock(&q->q_lock); -+ -+ if (try_signal) -+ pthread_cond_broadcast(&q->q_cond); -+} -+ -+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) -+{ -+ pthread_mutex_lock(&q->q_lock); -+ -+ while (q->head != d) -+ pthread_cond_wait(&q->q_cond, &q->q_lock); -+ -+ pthread_mutex_unlock(&q->q_lock); -+} -+ -+void decode_q_uninit(req_decode_q * const q) -+{ -+ pthread_mutex_destroy(&q->q_lock); -+ pthread_cond_destroy(&q->q_cond); -+} -+ -+void decode_q_init(req_decode_q * const q) -+{ -+ memset(q, 0, sizeof(*q)); -+ pthread_mutex_init(&q->q_lock, NULL); -+ pthread_cond_init(&q->q_cond, NULL); -+} -+ -+ -diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h -new file mode 100644 -index 0000000000..af7bbe1de4 ---- /dev/null -+++ b/libavcodec/v4l2_req_decode_q.h -@@ -0,0 +1,25 @@ -+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H -+#define AVCODEC_V4L2_REQ_DECODE_Q_H -+ -+typedef struct req_decode_ent { -+ struct req_decode_ent * next; -+ struct req_decode_ent * prev; -+ int in_q; -+} req_decode_ent; -+ -+typedef struct req_decode_q { -+ pthread_mutex_t q_lock; -+ pthread_cond_t q_cond; -+ req_decode_ent * head; -+ req_decode_ent * tail; -+} req_decode_q; -+ -+int decode_q_in_q(const req_decode_ent * const d); -+void decode_q_add(req_decode_q * const q, req_decode_ent * const d); -+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); -+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d); -+void decode_q_uninit(req_decode_q * const q); -+void decode_q_init(req_decode_q * const q); -+ -+#endif -+ -diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c -new file mode 100644 -index 0000000000..cfa94d55c4 ---- /dev/null -+++ b/libavcodec/v4l2_req_devscan.c -@@ -0,0 
+1,449 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+#include -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_utils.h" -+ -+struct decdev { -+ enum v4l2_buf_type src_type; -+ uint32_t src_fmt_v4l2; -+ const char * vname; -+ const char * mname; -+}; -+ -+struct devscan { -+ struct decdev env; -+ unsigned int dev_size; -+ unsigned int dev_count; -+ struct decdev *devs; -+}; -+ -+static int video_src_pixfmt_supported(uint32_t fmt) -+{ -+ return 1; -+} -+ -+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, -+ unsigned int width, unsigned int height, -+ unsigned int pixelformat) -+{ -+ unsigned int sizeimage; -+ -+ memset(format, 0, sizeof(*format)); -+ format->type = type; -+ -+ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { -+ format->fmt.pix_mp.width = width; -+ format->fmt.pix_mp.height = height; -+ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; -+ format->fmt.pix_mp.pixelformat = pixelformat; -+ } else { -+ format->fmt.pix.width = width; -+ format->fmt.pix.height = height; -+ format->fmt.pix.sizeimage = sizeimage; -+ format->fmt.pix.pixelformat = pixelformat; -+ } -+} -+ -+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, -+ unsigned int width, unsigned int height) -+{ -+ struct v4l2_format format; -+ -+ v4l2_setup_format(&format, type, width, height, pixelformat); -+ -+ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? 
-errno : 0; -+} -+ -+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) -+{ -+ struct v4l2_capability capability = { 0 }; -+ int rc; -+ -+ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); -+ if (rc < 0) -+ return -errno; -+ -+ if (capabilities != NULL) { -+ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) -+ *capabilities = capability.device_caps; -+ else -+ *capabilities = capability.capabilities; -+ } -+ -+ return 0; -+} -+ -+static int devscan_add(struct devscan *const scan, -+ enum v4l2_buf_type src_type, -+ uint32_t src_fmt_v4l2, -+ const char * vname, -+ const char * mname) -+{ -+ struct decdev *d; -+ -+ if (scan->dev_size <= scan->dev_count) { -+ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; -+ d = realloc(scan->devs, n * sizeof(*d)); -+ if (!d) -+ return -ENOMEM; -+ scan->devs = d; -+ scan->dev_size = n; -+ } -+ -+ d = scan->devs + scan->dev_count; -+ d->src_type = src_type; -+ d->src_fmt_v4l2 = src_fmt_v4l2; -+ d->vname = strdup(vname); -+ if (!d->vname) -+ return -ENOMEM; -+ d->mname = strdup(mname); -+ if (!d->mname) { -+ free((char *)d->vname); -+ return -ENOMEM; -+ } -+ ++scan->dev_count; -+ return 0; -+} -+ -+void devscan_delete(struct devscan **const pScan) -+{ -+ unsigned int i; -+ struct devscan * const scan = *pScan; -+ -+ if (!scan) -+ return; -+ *pScan = NULL; -+ -+ for (i = 0; i < scan->dev_count; ++i) { -+ free((char*)scan->devs[i].mname); -+ free((char*)scan->devs[i].vname); -+ } -+ free(scan->devs); -+ free(scan); -+} -+ -+#define REQ_BUF_CAPS (\ -+ V4L2_BUF_CAP_SUPPORTS_DMABUF |\ -+ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ -+ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) -+ -+static void probe_formats(void * const dc, -+ struct devscan *const scan, -+ const int fd, -+ const unsigned int type_v4l2, -+ const char *const mpath, -+ const char *const vpath) -+{ -+ unsigned int i; -+ for (i = 0;; ++i) { -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = i, -+ .type = type_v4l2 -+ }; -+ struct 
v4l2_requestbuffers rbufs = { -+ .count = 0, -+ .type = type_v4l2, -+ .memory = V4L2_MEMORY_MMAP -+ }; -+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { -+ if (errno == EINTR) -+ continue; -+ if (errno != EINVAL) -+ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2); -+ return; -+ } -+ if (!video_src_pixfmt_supported(fmtdesc.pixelformat)) -+ continue; -+ -+ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { -+ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat); -+ continue; -+ } -+ -+ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { -+ if (errno != EINTR) { -+ request_debug(dc, "%s: Reqbufs failed\n", vpath); -+ continue; -+ } -+ } -+ -+ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { -+ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities); -+ continue; -+ } -+ -+ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", -+ mpath, vpath, fmtdesc.pixelformat, type_v4l2); -+ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); -+ } -+} -+ -+ -+static int probe_video_device(void * const dc, -+ struct udev_device *const device, -+ struct devscan *const scan, -+ const char *const mpath) -+{ -+ int ret; -+ unsigned int capabilities = 0; -+ int video_fd = -1; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ request_err(dc, "%s: get video device devnode failed\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ video_fd = open(path, O_RDWR, 0); -+ if (video_fd == -1) { -+ ret = -errno; -+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ goto fail; -+ } -+ -+ ret = v4l2_query_capabilities(video_fd, &capabilities); -+ if (ret < 0) { -+ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities); -+ -+ if (!(capabilities & V4L2_CAP_STREAMING)) 
{ -+ request_debug(dc, "%s: missing required streaming capability\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { -+ request_debug(dc, "%s: missing required mem2mem capability\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ /* Should check capture formats too... */ -+ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0) -+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path); -+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) -+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path); -+ -+ close(video_fd); -+ return 0; -+ -+fail: -+ if (video_fd >= 0) -+ close(video_fd); -+ return ret; -+} -+ -+static int probe_media_device(void * const dc, -+ struct udev_device *const device, -+ struct devscan *const scan) -+{ -+ int ret; -+ int rv; -+ struct media_device_info device_info = { 0 }; -+ struct media_v2_topology topology = { 0 }; -+ struct media_v2_interface *interfaces = NULL; -+ struct udev *udev = udev_device_get_udev(device); -+ struct udev_device *video_device; -+ dev_t devnum; -+ int media_fd = -1; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ request_err(dc, "%s: get media device devnode failed\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ media_fd = open(path, O_RDWR, 0); -+ if (media_fd < 0) { -+ ret = -errno; -+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ if (topology.num_interfaces <= 0) 
{ -+ request_err(dc, "%s: media device has no interfaces\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); -+ if (!interfaces) { -+ request_err(dc, "%s: allocating media interface struct failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; -+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ for (int i = 0; i < topology.num_interfaces; i++) { -+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) -+ continue; -+ -+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); -+ video_device = udev_device_new_from_devnum(udev, 'c', devnum); -+ if (!video_device) { -+ ret = -errno; -+ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); -+ continue; -+ } -+ -+ ret = probe_video_device(dc, video_device, scan, path); -+ udev_device_unref(video_device); -+ -+ if (ret != 0) -+ goto fail; -+ } -+ -+fail: -+ free(interfaces); -+ if (media_fd != -1) -+ close(media_fd); -+ return ret; -+} -+ -+const char *decdev_media_path(const struct decdev *const dev) -+{ -+ return !dev ? NULL : dev->mname; -+} -+ -+const char *decdev_video_path(const struct decdev *const dev) -+{ -+ return !dev ? NULL : dev->vname; -+} -+ -+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) -+{ -+ return !dev ? 0 : dev->src_type; -+} -+ -+uint32_t decdev_src_pixelformat(const struct decdev *const dev) -+{ -+ return !dev ? 0 : dev->src_fmt_v4l2; -+} -+ -+ -+const struct decdev *devscan_find(struct devscan *const scan, -+ const uint32_t src_fmt_v4l2) -+{ -+ unsigned int i; -+ -+ if (scan->env.mname && scan->env.vname) -+ return &scan->env; -+ -+ if (!src_fmt_v4l2) -+ return scan->dev_count ? 
scan->devs + 0 : NULL; -+ -+ for (i = 0; i != scan->dev_count; ++i) { -+ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2) -+ return scan->devs + i; -+ } -+ return NULL; -+} -+ -+int devscan_build(void * const dc, struct devscan **pscan) -+{ -+ int ret; -+ struct udev *udev; -+ struct udev_enumerate *enumerate; -+ struct udev_list_entry *devices; -+ struct udev_list_entry *entry; -+ struct udev_device *device; -+ struct devscan * scan; -+ -+ *pscan = NULL; -+ -+ scan = calloc(1, sizeof(*scan)); -+ if (!scan) { -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH"); -+ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH"); -+ if (scan->env.mname && scan->env.vname) { -+ request_info(dc, "Media/video device env overrides found: %s,%s\n", -+ scan->env.mname, scan->env.vname); -+ *pscan = scan; -+ return 0; -+ } -+ -+ udev = udev_new(); -+ if (!udev) { -+ request_err(dc, "%s: allocating udev context failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ enumerate = udev_enumerate_new(udev); -+ if (!enumerate) { -+ request_err(dc, "%s: allocating udev enumerator failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ udev_enumerate_add_match_subsystem(enumerate, "media"); -+ udev_enumerate_scan_devices(enumerate); -+ -+ devices = udev_enumerate_get_list_entry(enumerate); -+ udev_list_entry_foreach(entry, devices) { -+ const char *path = udev_list_entry_get_name(entry); -+ if (!path) -+ continue; -+ -+ device = udev_device_new_from_syspath(udev, path); -+ if (!device) -+ continue; -+ -+ probe_media_device(dc, device, scan); -+ udev_device_unref(device); -+ } -+ -+ udev_enumerate_unref(enumerate); -+ -+ *pscan = scan; -+ return 0; -+ -+fail: -+ udev_unref(udev); -+ devscan_delete(&scan); -+ return ret; -+} -+ -diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h -new file mode 100644 -index 0000000000..956d9234f1 ---- /dev/null -+++ b/libavcodec/v4l2_req_devscan.h -@@ -0,0 +1,23 @@ 
-+#ifndef _DEVSCAN_H_ -+#define _DEVSCAN_H_ -+ -+#include -+ -+struct devscan; -+struct decdev; -+enum v4l2_buf_type; -+ -+/* These return pointers to data in the devscan structure and so are vaild -+ * for the lifetime of that -+ */ -+const char *decdev_media_path(const struct decdev *const dev); -+const char *decdev_video_path(const struct decdev *const dev); -+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev); -+uint32_t decdev_src_pixelformat(const struct decdev *const dev); -+ -+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2); -+ -+int devscan_build(void * const dc, struct devscan **pscan); -+void devscan_delete(struct devscan **const pScan); -+ -+#endif -diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c -new file mode 100644 -index 0000000000..ae6c648369 ---- /dev/null -+++ b/libavcodec/v4l2_req_dmabufs.c -@@ -0,0 +1,266 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_utils.h" -+ -+#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" -+#define DMABUF_NAME2 "/dev/dma_heap/reserved" -+ -+#define TRACE_ALLOC 0 -+ -+struct dmabufs_ctl { -+ int fd; -+ size_t page_size; -+}; -+ -+struct dmabuf_h { -+ int fd; -+ size_t size; -+ size_t len; -+ void * mapptr; -+}; -+ -+#if TRACE_ALLOC -+static unsigned int total_bufs = 0; -+static size_t total_size = 0; -+#endif -+ -+struct dmabuf_h * dmabuf_import(int fd, size_t size) -+{ -+ struct dmabuf_h *dh; -+ -+ fd = dup(fd); -+ if (fd < 0 || size == 0) -+ return NULL; -+ -+ dh = malloc(sizeof(*dh)); -+ if (!dh) { -+ close(fd); -+ return NULL; -+ } -+ -+ *dh = (struct dmabuf_h) { -+ .fd = fd, -+ .size = size, -+ .mapptr = MAP_FAILED -+ }; -+ -+#if TRACE_ALLOC -+ ++total_bufs; -+ total_size += dh->size; -+ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif 
-+ -+ return dh; -+} -+ -+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) -+{ -+ struct dmabuf_h * dh; -+ struct dma_heap_allocation_data data = { -+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), -+ .fd = 0, -+ .fd_flags = O_RDWR, -+ .heap_flags = 0 -+ }; -+ -+ if (old != NULL) { -+ if (old->size == data.len) { -+ return old; -+ } -+ dmabuf_free(old); -+ } -+ -+ if (size == 0 || -+ (dh = malloc(sizeof(*dh))) == NULL) -+ return NULL; -+ -+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { -+ int err = errno; -+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", -+ (uint64_t)data.len, -+ dbsc->fd, -+ err, -+ strerror(err)); -+ if (err == EINTR) -+ continue; -+ goto fail; -+ } -+ -+ *dh = (struct dmabuf_h){ -+ .fd = data.fd, -+ .size = (size_t)data.len, -+ .mapptr = MAP_FAILED -+ }; -+ -+#if TRACE_ALLOC -+ ++total_bufs; -+ total_size += dh->size; -+ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif -+ -+ return dh; -+ -+fail: -+ free(dh); -+ return NULL; -+} -+ -+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) -+{ -+ struct dma_buf_sync sync = { -+ .flags = flags -+ }; -+ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { -+ const int err = errno; -+ if (errno == EINTR) -+ continue; -+ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags); -+ return -err; -+ } -+ return 0; -+} -+ -+int dmabuf_write_start(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); -+} -+ -+int dmabuf_write_end(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); -+} -+ -+int dmabuf_read_start(struct dmabuf_h * const dh) -+{ -+ if (!dmabuf_map(dh)) -+ return -1; -+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ); -+} -+ -+int dmabuf_read_end(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, 
DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); -+} -+ -+ -+void * dmabuf_map(struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return NULL; -+ if (dh->mapptr != MAP_FAILED) -+ return dh->mapptr; -+ dh->mapptr = mmap(NULL, dh->size, -+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ dh->fd, 0); -+ if (dh->mapptr == MAP_FAILED) { -+ request_log("%s: Map failed\n", __func__); -+ return NULL; -+ } -+ return dh->mapptr; -+} -+ -+int dmabuf_fd(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return -1; -+ return dh->fd; -+} -+ -+size_t dmabuf_size(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return 0; -+ return dh->size; -+} -+ -+size_t dmabuf_len(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return 0; -+ return dh->len; -+} -+ -+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) -+{ -+ dh->len = len; -+} -+ -+ -+ -+void dmabuf_free(struct dmabuf_h * dh) -+{ -+ if (!dh) -+ return; -+ -+#if TRACE_ALLOC -+ --total_bufs; -+ total_size -= dh->size; -+ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif -+ -+ if (dh->mapptr != MAP_FAILED) -+ munmap(dh->mapptr, dh->size); -+ while (close(dh->fd) == -1 && errno == EINTR) -+ /* loop */; -+ free(dh); -+} -+ -+struct dmabufs_ctl * dmabufs_ctl_new(void) -+{ -+ struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); -+ -+ if (!dbsc) -+ return NULL; -+ -+ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ -+ if (dbsc->fd == -1) { -+ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ if (dbsc->fd == -1) { -+ request_log("Unable to open either %s or %s\n", -+ DMABUF_NAME1, DMABUF_NAME2); -+ goto fail; -+ } -+ } -+ -+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); -+ -+ return dbsc; -+ -+fail: -+ free(dbsc); -+ return NULL; -+} -+ -+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) -+{ -+ struct dmabufs_ctl * const dbsc = *pDbsc; -+ -+ if (!dbsc) -+ return; -+ 
*pDbsc = NULL; -+ -+ while (close(dbsc->fd) == -1 && errno == EINTR) -+ /* loop */; -+ -+ free(dbsc); -+} -+ -+ -diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h -new file mode 100644 -index 0000000000..cfb17e801d ---- /dev/null -+++ b/libavcodec/v4l2_req_dmabufs.h -@@ -0,0 +1,40 @@ -+#ifndef DMABUFS_H -+#define DMABUFS_H -+ -+#include -+ -+struct dmabufs_ctl; -+struct dmabuf_h; -+ -+struct dmabufs_ctl * dmabufs_ctl_new(void); -+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); -+ -+// Need not preserve old contents -+// On NULL return old buffer is freed -+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); -+ -+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { -+ return dmabuf_realloc(dbsc, NULL, size); -+} -+/* Create from existing fd - dups(fd) */ -+struct dmabuf_h * dmabuf_import(int fd, size_t size); -+void * dmabuf_map(struct dmabuf_h * const dh); -+ -+/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ -+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); -+ -+int dmabuf_write_start(struct dmabuf_h * const dh); -+int dmabuf_write_end(struct dmabuf_h * const dh); -+int dmabuf_read_start(struct dmabuf_h * const dh); -+int dmabuf_read_end(struct dmabuf_h * const dh); -+ -+int dmabuf_fd(const struct dmabuf_h * const dh); -+/* Allocated size */ -+size_t dmabuf_size(const struct dmabuf_h * const dh); -+/* Bytes in use */ -+size_t dmabuf_len(const struct dmabuf_h * const dh); -+/* Set bytes in use */ -+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); -+void dmabuf_free(struct dmabuf_h * dh); -+ -+#endif -diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c -new file mode 100644 -index 0000000000..169b532832 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v1.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 1 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_v2.c 
b/libavcodec/v4l2_req_hevc_v2.c -new file mode 100644 -index 0000000000..42af98e156 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v2.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 2 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -new file mode 100644 -index 0000000000..0ae03b10c4 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -0,0 +1,1213 @@ -+// File included by v4l2_req_hevc_v* - not compiled on its own -+ -+#include "decode.h" -+#include "hevcdec.h" -+#include "hwconfig.h" -+#include "internal.h" -+#include "thread.h" -+ -+#include "v4l2_request_hevc.h" -+ -+#if HEVC_CTRLS_VERSION == 1 -+#include "hevc-ctrls-v1.h" -+ -+// Fixup renamed entries -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT -+ -+#elif HEVC_CTRLS_VERSION == 2 -+#include "hevc-ctrls-v2.h" -+#else -+#error Unknown HEVC_CTRLS_VERSION -+#endif -+ -+#include "libavutil/hwcontext_drm.h" -+ -+#include -+#include -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_media.h" -+#include "v4l2_req_utils.h" -+ -+// Attached to buf[0] in frame -+// Pooled in hwcontext so generally create once - 1/frame -+typedef struct V4L2MediaReqDescriptor { -+ AVDRMFrameDescriptor drm; -+ -+ // Media -+ uint64_t timestamp; -+ struct qent_dst * qe_dst; -+ -+ // Decode only - should be NULL by the time we emit the frame -+ struct req_decode_ent decode_ent; -+ -+ struct media_request *req; -+ struct qent_src *qe_src; -+ -+#if HEVC_CTRLS_VERSION >= 2 -+ struct v4l2_ctrl_hevc_decode_params dec; -+#endif -+ -+ size_t num_slices; -+ size_t alloced_slices; -+ struct v4l2_ctrl_hevc_slice_params * slice_params; -+ struct slice_info * slices; -+ -+} V4L2MediaReqDescriptor; -+ -+struct slice_info { -+ const uint8_t * ptr; -+ size_t len; // bytes -+}; -+ -+// Handy container for accumulating controls before setting -+struct req_controls { 
-+ int has_scaling; -+ struct timeval tv; -+ struct v4l2_ctrl_hevc_sps sps; -+ struct v4l2_ctrl_hevc_pps pps; -+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; -+}; -+ -+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; -+ -+ -+// Get an FFmpeg format from the v4l2 format -+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) -+{ -+ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? -+ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { -+ case V4L2_PIX_FMT_YUV420: -+ return AV_PIX_FMT_YUV420P; -+ case V4L2_PIX_FMT_NV12: -+ return AV_PIX_FMT_NV12; -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ return AV_PIX_FMT_RPI4_8; -+ case V4L2_PIX_FMT_NV12_10_COL128: -+ return AV_PIX_FMT_RPI4_10; -+#endif -+ default: -+ break; -+ } -+ return AV_PIX_FMT_NONE; -+} -+ -+static inline uint64_t frame_capture_dpb(const AVFrame * const frame) -+{ -+ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; -+ return rd->timestamp; -+} -+ -+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) -+{ -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; -+ rd->timestamp = dpb_stamp; -+} -+ -+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) -+{ -+ int32_t luma_weight_denom, chroma_weight_denom; -+ const SliceHeader *sh = &h->sh; -+ -+ if (sh->slice_type == HEVC_SLICE_I || -+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || -+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) -+ return; -+ -+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; -+ -+ if (h->ps.sps->chroma_format_idc) -+ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; -+ -+ luma_weight_denom = (1 << sh->luma_log2_weight_denom); -+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); -+ -+ for (int i = 0; i < 15 && i 
< sh->nb_refs[L0]; i++) { -+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; -+ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; -+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; -+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; -+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; -+ } -+ -+ if (sh->slice_type != HEVC_SLICE_B) -+ return; -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { -+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; -+ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; -+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; -+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; -+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; -+ } -+} -+ -+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) -+{ -+ const HEVCFrame *frame; -+ int i; -+ -+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { -+ frame = h->rps[ST_CURR_BEF].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; -+ } -+ -+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { -+ frame = h->rps[ST_CURR_AFT].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; -+ } -+ -+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { -+ frame = h->rps[LT_CURR].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; -+ } -+ -+ return 0; -+} -+ -+static unsigned int -+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, -+ const struct v4l2_hevc_dpb_entry * const entries, -+ const unsigned int num_entries) -+{ -+ uint64_t 
timestamp; -+ -+ if (!frame) -+ return 0; -+ -+ timestamp = frame_capture_dpb(frame->frame); -+ -+ for (unsigned int i = 0; i < num_entries; i++) { -+ if (entries[i].timestamp == timestamp) -+ return i; -+ } -+ -+ return 0; -+} -+ -+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) -+{ -+ unsigned int z = 0; -+ while (idx--) { -+ if (*b++ == 0) { -+ ++z; -+ if (z >= 2 && *b == 3) { -+ ++b; -+ z = 0; -+ } -+ } -+ else { -+ z = 0; -+ } -+ } -+ return b; -+} -+ -+static int slice_add(V4L2MediaReqDescriptor * const rd) -+{ -+ if (rd->num_slices >= rd->alloced_slices) { -+ struct v4l2_ctrl_hevc_slice_params * p2; -+ struct slice_info * s2; -+ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; -+ -+ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); -+ if (p2 == NULL) -+ return AVERROR(ENOMEM); -+ rd->slice_params = p2; -+ -+ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2)); -+ if (s2 == NULL) -+ return AVERROR(ENOMEM); -+ rd->slices = s2; -+ -+ rd->alloced_slices = n2; -+ } -+ ++rd->num_slices; -+ return 0; -+} -+ -+static unsigned int -+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) -+{ -+ unsigned int i; -+ unsigned int n = 0; -+ const HEVCFrame * const pic = h->ref; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { -+ const HEVCFrame * const frame = &h->DPB[i]; -+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { -+ struct v4l2_hevc_dpb_entry * const entry = entries + n++; -+ -+ entry->timestamp = frame_capture_dpb(frame->frame); -+ entry->rps = find_frame_rps_type(h, entry->timestamp); -+ entry->field_pic = frame->frame->interlaced_frame; -+ -+ /* TODO: Interleaved: Get the POC for each field. 
*/ -+ entry->pic_order_cnt[0] = frame->poc; -+ entry->pic_order_cnt[1] = frame->poc; -+ } -+ } -+ return n; -+} -+ -+static void fill_slice_params(const HEVCContext * const h, -+#if HEVC_CTRLS_VERSION >= 2 -+ const struct v4l2_ctrl_hevc_decode_params * const dec, -+#endif -+ struct v4l2_ctrl_hevc_slice_params *slice_params, -+ uint32_t bit_size, uint32_t bit_offset) -+{ -+ const SliceHeader * const sh = &h->sh; -+#if HEVC_CTRLS_VERSION >= 2 -+ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; -+ const unsigned int dpb_n = dec->num_active_dpb_entries; -+#else -+ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; -+ unsigned int dpb_n; -+#endif -+ unsigned int i; -+ RefPicList *rpl; -+ -+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { -+ .bit_size = bit_size, -+ .data_bit_offset = bit_offset, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_segment_addr = sh->slice_segment_addr, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ .nal_unit_type = h->nal_unit_type, -+ .nuh_temporal_id_plus1 = h->temporal_id + 1, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_type = sh->slice_type, -+ .colour_plane_id = sh->colour_plane_id, -+ .slice_pic_order_cnt = h->ref->poc, -+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, -+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, -+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 
0 : 5 - sh->max_num_merge_cand, -+ .slice_qp_delta = sh->slice_qp_delta, -+ .slice_cb_qp_offset = sh->slice_cb_qp_offset, -+ .slice_cr_qp_offset = sh->slice_cr_qp_offset, -+ .slice_act_y_qp_offset = 0, -+ .slice_act_cb_qp_offset = 0, -+ .slice_act_cr_qp_offset = 0, -+ .slice_beta_offset_div2 = sh->beta_offset / 2, -+ .slice_tc_offset_div2 = sh->tc_offset / 2, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ .pic_struct = h->sei.picture_timing.picture_struct, -+ -+#if HEVC_CTRLS_VERSION < 2 -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+#endif -+ }; -+ -+ if (sh->slice_sample_adaptive_offset_flag[0]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; -+ -+ if (sh->slice_sample_adaptive_offset_flag[1]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; -+ -+ if (sh->slice_temporal_mvp_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; -+ -+ if (sh->mvd_l1_zero_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; -+ -+ if (sh->cabac_init_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; -+ -+ if (sh->collocated_list == L0) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; -+ -+ if (sh->disable_deblocking_filter_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; -+ -+ if (sh->slice_loop_filter_across_slices_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (sh->dependent_slice_segment_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; -+ -+#if HEVC_CTRLS_VERSION < 2 -+ dpb_n = fill_dpb_entries(h, dpb); -+ slice_params->num_active_dpb_entries = dpb_n; 
-+#endif -+ -+ if (sh->slice_type != HEVC_SLICE_I) { -+ rpl = &h->ref->refPicList[0]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ rpl = &h->ref->refPicList[1]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); -+ } -+ -+ fill_pred_table(h, &slice_params->pred_weight_table); -+ -+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; -+ if (slice_params->num_entry_point_offsets > 256) { -+ slice_params->num_entry_point_offsets = 256; -+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); -+ } -+ -+ for (i = 0; i < slice_params->num_entry_point_offsets; i++) -+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; -+} -+ -+#if HEVC_CTRLS_VERSION >= 2 -+static void -+fill_decode_params(const HEVCContext * const h, -+ struct v4l2_ctrl_hevc_decode_params * const dec) -+{ -+ unsigned int i; -+ -+ *dec = (struct v4l2_ctrl_hevc_decode_params){ -+ .pic_order_cnt_val = h->poc, -+ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+ }; -+ -+ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); -+ -+ // The docn does seem to ask that we fit our 32 bit signed POC into -+ // a U8 so... 
(To be fair 16 bits would be enough) -+ // Luckily we (Pi) don't use these fields -+ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) -+ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; -+ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) -+ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; -+ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) -+ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; -+ -+ if (IS_IRAP(h)) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; -+ if (IS_IDR(h)) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; -+ if (h->sh.no_output_of_prior_pics_flag) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; -+ -+} -+#endif -+ -+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) -+{ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ *ctrl = (struct v4l2_ctrl_hevc_sps) { -+ .chroma_format_idc = sps->chroma_format_idc, -+ .pic_width_in_luma_samples = sps->width, -+ .pic_height_in_luma_samples = sps->height, -+ .bit_depth_luma_minus8 = sps->bit_depth - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth - 8, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, -+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, -+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, -+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, -+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, -+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, -+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, -+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, -+ .max_transform_hierarchy_depth_intra = 
sps->max_transform_hierarchy_depth_intra, -+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, -+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, -+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, -+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, -+ .num_short_term_ref_pic_sets = sps->nb_st_rps, -+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, -+ .chroma_format_idc = sps->chroma_format_idc, -+ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, -+ }; -+ -+ if (sps->separate_colour_plane_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ -+ if (sps->scaling_list_enable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; -+ -+ if (sps->amp_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; -+ -+ if (sps->sao_enabled) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; -+ -+ if (sps->pcm_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; -+ -+ if (sps->pcm.loop_filter_disable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; -+ -+ if (sps->long_term_ref_pics_present_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; -+ -+ if (sps->sps_strong_intra_smoothing_enable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+} -+ -+static void fill_scaling_matrix(const ScalingList * const sl, -+ struct v4l2_ctrl_hevc_scaling_matrix * const sm) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < 6; i++) { -+ unsigned int j; -+ -+ for (j = 0; j < 16; j++) -+ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; -+ for (j = 0; j < 64; j++) { -+ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; -+ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; -+ if (i < 2) -+ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; -+ } -+ 
sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; -+ if (i < 2) -+ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; -+ } -+} -+ -+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) -+{ -+ uint64_t flags = 0; -+ -+ if (pps->dependent_slice_segments_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; -+ -+ if (pps->output_flag_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; -+ -+ if (pps->sign_data_hiding_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; -+ -+ if (pps->cabac_init_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; -+ -+ if (pps->constrained_intra_pred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ -+ if (pps->transform_skip_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; -+ -+ if (pps->cu_qp_delta_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; -+ -+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; -+ -+ if (pps->weighted_pred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; -+ -+ if (pps->weighted_bipred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; -+ -+ if (pps->transquant_bypass_enable_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; -+ -+ if (pps->tiles_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; -+ -+ if (pps->loop_filter_across_tiles_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; -+ -+ if (pps->seq_loop_filter_across_slices_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (pps->deblocking_filter_override_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; -+ -+ if (pps->disable_dbf) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; -+ 
-+ if (pps->lists_modification_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; -+ -+ if (pps->slice_header_extension_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ *ctrl = (struct v4l2_ctrl_hevc_pps) { -+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, -+ .init_qp_minus26 = pps->pic_init_qp_minus26, -+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, -+ .pps_cb_qp_offset = pps->cb_qp_offset, -+ .pps_cr_qp_offset = pps->cr_qp_offset, -+ .pps_beta_offset_div2 = pps->beta_offset / 2, -+ .pps_tc_offset_div2 = pps->tc_offset / 2, -+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, -+ .flags = flags -+ }; -+ -+ -+ if (pps->tiles_enabled_flag) { -+ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; -+ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; -+ -+ for (int i = 0; i < pps->num_tile_columns; i++) -+ ctrl->column_width_minus1[i] = pps->column_width[i] - 1; -+ -+ for (int i = 0; i < pps->num_tile_rows; i++) -+ ctrl->row_height_minus1[i] = pps->row_height[i] - 1; -+ } -+} -+ -+// Called before finally returning the frame to the user -+// Set corrupt flag here as this is actually the frame structure that -+// is going to the user (in MT land each thread has its own pool) -+static int frame_post_process(void *logctx, AVFrame *frame) -+{ -+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; -+ -+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); -+ frame->flags &= ~AV_FRAME_FLAG_CORRUPT; -+ if (rd->qe_dst) { -+ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); -+ if (stat != MEDIABUFS_STATUS_SUCCESS) { -+ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); -+ frame->flags |= AV_FRAME_FLAG_CORRUPT; -+ } -+ } -+ -+ return 0; -+} -+ -+static inline struct timeval cvt_dpb_to_tv(uint64_t t) -+{ -+ t /= 1000; -+ return (struct timeval){ -+ .tv_usec = t % 1000000, 
-+ .tv_sec = t / 1000000 -+ }; -+} -+ -+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) -+{ -+ return (uint64_t)t * 1000; -+} -+ -+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); -+ decode_q_add(&ctx->decode_q, &rd->decode_ent); -+ -+ rd->num_slices = 0; -+ ctx->timestamp++; -+ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); -+ -+ { -+ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; -+ fdd->post_process = frame_post_process; -+ } -+ -+ // qe_dst needs to be bound to the data buffer and only returned when that is -+ if (!rd->qe_dst) -+ { -+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame -+ -+ return 0; -+} -+ -+// Object fd & size will be zapped by this & need setting later -+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) -+{ -+ AVDRMLayerDescriptor *layer = &desc->layers[0]; -+ unsigned int width; -+ unsigned int height; -+ unsigned int bpl; -+ uint32_t pixelformat; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ width = format->fmt.pix_mp.width; -+ height = format->fmt.pix_mp.height; -+ pixelformat = format->fmt.pix_mp.pixelformat; -+ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; -+ } -+ else { -+ width = format->fmt.pix.width; -+ height = format->fmt.pix.height; -+ pixelformat = format->fmt.pix.pixelformat; -+ bpl = format->fmt.pix.bytesperline; -+ } -+ -+ switch 
(pixelformat) { -+ case V4L2_PIX_FMT_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); -+ break; -+ case V4L2_PIX_FMT_NV12_10_COL128: -+ layer->format = DRM_FORMAT_P030; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); -+ break; -+#endif -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ case V4L2_PIX_FMT_SUNXI_TILED_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; -+ break; -+#endif -+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) -+ case V4L2_PIX_FMT_NV15: -+ layer->format = DRM_FORMAT_NV15; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#endif -+ case V4L2_PIX_FMT_NV16: -+ layer->format = DRM_FORMAT_NV16; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) -+ case V4L2_PIX_FMT_NV20: -+ layer->format = DRM_FORMAT_NV20; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#endif -+ default: -+ return -1; -+ } -+ -+ desc->nb_objects = 1; -+ desc->objects[0].fd = -1; -+ desc->objects[0].size = 0; -+ -+ desc->nb_layers = 1; -+ layer->nb_planes = 2; -+ -+ layer->planes[0].object_index = 0; -+ layer->planes[0].offset = 0; -+ layer->planes[0].pitch = bpl; -+#if CONFIG_SAND -+ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = width; -+ layer->planes[1].pitch = width; -+ } -+ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy -+ 
layer->planes[1].pitch = width * 2; -+ } -+ else -+#endif -+ { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = layer->planes[0].pitch * height; -+ layer->planes[1].pitch = layer->planes[0].pitch; -+ } -+ -+ return 0; -+} -+ -+static int -+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, -+ struct req_controls *const controls, -+#if HEVC_CTRLS_VERSION >= 2 -+ struct v4l2_ctrl_hevc_decode_params * const dec, -+#endif -+ struct v4l2_ctrl_hevc_slice_params * const slices, -+ const unsigned int slice_no, -+ const unsigned int slice_count) -+{ -+ int rv; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+#if HEVC_CTRLS_VERSION >= 2 -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, -+ .ptr = dec, -+ .size = sizeof(*dec), -+ }, -+#endif -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = slices + slice_no, -+ .size = sizeof(*slices) * slice_count, -+ }, -+ // Optional -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, -+ }; -+ -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, -+ controls->has_scaling ? 
-+ FF_ARRAY_ELEMS(control) : -+ FF_ARRAY_ELEMS(control) - 1); -+ -+ return rv; -+} -+ -+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const HEVCContext * const h = avctx->priv_data; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; -+ int bcount = get_bits_count(&h->HEVClc->gb); -+ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; -+ -+ int rv; -+ struct slice_info * si; -+ -+ if ((rv = slice_add(rd)) != 0) -+ return rv; -+ -+ si = rd->slices + rd->num_slices - 1; -+ si->ptr = buffer; -+ si->len = size; -+ -+ if (ctx->multi_slice && rd->num_slices > 1) { -+ struct slice_info *const si0 = rd->slices; -+ const size_t offset = (buffer - si0->ptr); -+ boff += offset * 8; -+ size += offset; -+ si0->len = si->len + offset; -+ } -+ -+#if HEVC_CTRLS_VERSION >= 2 -+ if (rd->num_slices == 1) -+ fill_decode_params(h, &rd->dec); -+ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); -+#else -+ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); -+#endif -+ -+ return 0; -+} -+ -+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) -+{ -+ const HEVCContext * const h = avctx->priv_data; -+ if (h->ref != NULL) { -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ media_request_abort(&rd->req); -+ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); -+ -+ decode_q_remove(&ctx->decode_q, &rd->decode_ent); -+ } -+} -+ -+static int send_slice(AVCodecContext * const avctx, -+ V4L2MediaReqDescriptor * const rd, -+ struct req_controls *const controls, -+ const unsigned int i, const unsigned int j) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; 
/*
 * Hwaccel end_frame hook: submit all slices recorded for the current frame.
 *
 * Builds the SPS/PPS/scaling-matrix controls, makes sure the frame has a
 * destination buffer, sends the slices (as one request in multi-slice mode,
 * otherwise one request per slice) and fills in the DRM prime descriptor.
 * The frame is removed from the decode queue on both success and failure.
 *
 * Returns 0 on success, AVERROR_INVALIDDATA if the frame is not in the
 * decode queue, AVERROR(ENOMEM) if no destination buffer could be allocated,
 * or the error returned by send_slice().
 */
static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
{
    const HEVCContext * const h = avctx->priv_data;
    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
    struct req_controls rc;
    unsigned int i;
    int rv;

    // It is possible, though maybe a bug, to get an end_frame without
    // a previous start_frame.  If we do then give up.
    if (!decode_q_in_q(&rd->decode_ent)) {
        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
        return AVERROR_INVALIDDATA;
    }

    {
        // A PPS scaling list takes priority over an SPS one; NULL if neither
        // flag is set
        const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
                                    &h->ps.pps->scaling_list :
                                h->ps.sps->scaling_list_enable_flag ?
                                    &h->ps.sps->scaling_list : NULL;


        memset(&rc, 0, sizeof(rc));
        rc.tv = cvt_dpb_to_tv(rd->timestamp);
        fill_sps(&rc.sps, h->ps.sps);
        fill_pps(&rc.pps, h->ps.pps);
        if (sl) {
            rc.has_scaling = 1;
            fill_scaling_matrix(sl, &rc.scaling_matrix);
        }
    }

    // NOTE(review): presumably blocks until this frame is at the head of the
    // decode queue so frames are submitted in order - confirm in decode_q_wait
    decode_q_wait(&ctx->decode_q, &rd->decode_ent);

    // qe_dst needs to be bound to the data buffer and only returned when that is
    // Alloc almost certainly wants to be serialised if there is any chance of blocking
    // so we get the next frame to be free in the thread that needs it for decode first.
    //
    // In our current world this probably isn't a concern but put it here anyway
    if (!rd->qe_dst)
    {
        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
            rv = AVERROR(ENOMEM);
            goto fail;
        }
    }

    // Send as slices
    if (ctx->multi_slice)
    {
        // One request covering slices [0, num_slices)
        if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0)
            goto fail;
    }
    else
    {
        // One request per slice: [i, i + 1)
        for (i = 0; i != rd->num_slices; ++i) {
            if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0)
                goto fail;
        }
    }

    // Set the drm_prime descriptor
    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));

    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
    return 0;

fail:
    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
    return rv;
}
const HEVCContext *h = avctx->priv_data; -+ const HEVCSPS * const sps = h->ps.sps; -+ struct v4l2_ctrl_hevc_sps ctrl_sps; -+ unsigned int i; -+ -+ // Check for var slice array -+ struct v4l2_query_ext_ctrl qc[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, -+#if HEVC_CTRLS_VERSION >= 2 -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, -+#endif -+ }; -+ // Order & size must match! -+ static const size_t ctrl_sizes[] = { -+ sizeof(struct v4l2_ctrl_hevc_slice_params), -+ sizeof(struct v4l2_ctrl_hevc_sps), -+ sizeof(struct v4l2_ctrl_hevc_pps), -+ sizeof(struct v4l2_ctrl_hevc_scaling_matrix), -+#if HEVC_CTRLS_VERSION >= 2 -+ sizeof(struct v4l2_ctrl_hevc_decode_params), -+#endif -+ }; -+ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); -+ -+ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); -+ return AVERROR(EINVAL); -+ } -+ for (i = 0; i != noof_ctrls; ++i) { -+ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", -+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); -+ return AVERROR(EINVAL); -+ } -+ } -+ -+ fill_sps(&ctrl_sps, sps); -+ -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; -+ return 0; -+} -+ -+// Final init -+static int -+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -+{ -+ int ret; -+ -+ struct v4l2_query_ext_ctrl querys[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ { .id = 
V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, -+ }; -+ -+ struct v4l2_ext_control ctrls[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ }; -+ -+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); -+ -+ ctx->decode_mode = querys[0].default_value; -+ -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->start_code = querys[1].default_value; -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->max_slices = querys[2].elems; -+ if (ctx->max_slices > MAX_SLICES) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); -+ return AVERROR(EINVAL); -+ } -+ -+ ctrls[0].value = ctx->decode_mode; -+ ctrls[1].value = ctx->start_code; -+ -+ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); -+ return !ret ? 
0 : AVERROR(-ret); -+} -+ -+static void v4l2_req_frame_free(void *opaque, uint8_t *data) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data); -+ -+ qent_dst_unref(&rd->qe_dst); -+ -+ // We don't expect req or qe_src to be set -+ if (rd->req || rd->qe_src) -+ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src); -+ -+ av_freep(&rd->slices); -+ av_freep(&rd->slice_params); -+ -+ av_free(rd); -+} -+ -+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size) -+{ -+ AVCodecContext *avctx = opaque; -+// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+// V4L2MediaReqDescriptor *req; -+ AVBufferRef *ref; -+ uint8_t *data; -+// int ret; -+ -+ data = av_mallocz(size); -+ if (!data) -+ return NULL; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); -+ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0); -+ if (!ref) { -+ av_freep(&data); -+ return NULL; -+ } -+ return ref; -+} -+ -+#if 0 -+static void v4l2_req_pool_free(void *opaque) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); -+} -+ -+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); -+ -+ av_buffer_pool_uninit(&hwfc->pool); -+} -+#endif -+ -+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data; -+ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs); -+ -+ hwfc->format = AV_PIX_FMT_DRM_PRIME; -+ hwfc->sw_format = pixel_format_from_format(vfmt); -+ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { -+ hwfc->width = vfmt->fmt.pix_mp.width; -+ hwfc->height = 
vfmt->fmt.pix_mp.height; -+ } else { -+ hwfc->width = vfmt->fmt.pix.width; -+ hwfc->height = vfmt->fmt.pix.height; -+ } -+#if 0 -+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); -+ if (!hwfc->pool) -+ return AVERROR(ENOMEM); -+ -+ hwfc->free = v4l2_req_hwframe_ctx_free; -+ -+ hwfc->initial_pool_size = 1; -+ -+ switch (avctx->codec_id) { -+ case AV_CODEC_ID_VP9: -+ hwfc->initial_pool_size += 8; -+ break; -+ case AV_CODEC_ID_VP8: -+ hwfc->initial_pool_size += 3; -+ break; -+ default: -+ hwfc->initial_pool_size += 2; -+ } -+#endif -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); -+ -+ return 0; -+} -+ -+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ int rv; -+ -+ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); -+ if (!frame->buf[0]) -+ return AVERROR(ENOMEM); -+ -+ frame->data[0] = frame->buf[0]->data; -+ -+ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); -+ -+ if ((rv = ff_attach_decode_data(frame)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); -+ av_frame_unref(frame); -+ return rv; -+ } -+ -+ return 0; -+} -+ -+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { -+ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, -+ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), -+ .probe = probe, -+ .set_controls = set_controls, -+ -+ .start_frame = v4l2_request_hevc_start_frame, -+ .decode_slice = v4l2_request_hevc_decode_slice, -+ .end_frame = v4l2_request_hevc_end_frame, -+ .abort_frame = v4l2_request_hevc_abort_frame, -+ .frame_params = frame_params, -+ .alloc_frame = alloc_frame, -+}; -+ -diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c -new file mode 100644 -index 0000000000..eb00ecb406 ---- 
/*
 * floor(log2(x)) for x >= 1; returns 0 for x == 0.
 *
 * Works for the full range of size_t, including values >= 2^32 on platforms
 * where size_t is 64 bits wide.
 */
static unsigned int log2_size(size_t x)
{
    unsigned int n = 0;

    /* Two 16-bit shifts keep this well-defined when size_t is only 32 bits */
    if (sizeof(x) > 4 && ((x >> 16) >> 16) != 0) {
        n += 32;
        x = (x >> 16) >> 16;
    }
    if (x & ~(size_t)0xffff) {
        n += 16;
        x >>= 16;
    }
    if (x & ~(size_t)0xff) {
        n += 8;
        x >>= 8;
    }
    if (x & ~(size_t)0xf) {
        n += 4;
        x >>= 4;
    }
    if (x & ~(size_t)3) {
        n += 2;
        x >>= 2;
    }
    return (x & ~(size_t)1) ? n + 1 : n;
}

/*
 * Round a buffer size up to the next value of the form 3 * 2^n or 4 * 2^n
 * that is strictly greater than x.  Sizes below 256 are treated as 256-class
 * and yield 768.
 *
 * All arithmetic is done in size_t so large sizes do not overflow int.  If
 * the rounded value cannot be represented in size_t, x is returned unchanged.
 */
static size_t round_up_size(const size_t x)
{
    /* Admit no size < 256 */
    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
    const unsigned int bits = (unsigned int)(sizeof(size_t) * 8);
    size_t three_n;

    /* (size_t)4 << n needs n + 3 bits */
    if (n > bits - 3)
        return x;

    three_n = (size_t)3 << n;
    return x >= three_n ? (size_t)4 << n : three_n;
}
"en" : "dis", strerror(-rc)); -+ return rc; -+ } -+ -+ return 0; -+} -+ -+ -+ -+struct media_request * media_request_get(struct media_pool * const mp) -+{ -+ struct media_request *req = NULL; -+ -+ /* Timeout handled by poll code */ -+ if (do_wait(&mp->sem)) -+ return NULL; -+ -+ pthread_mutex_lock(&mp->lock); -+ req = mp->free_reqs; -+ if (req) { -+ mp->free_reqs = req->next; -+ req->next = NULL; -+ } -+ pthread_mutex_unlock(&mp->lock); -+ return req; -+} -+ -+int media_request_fd(const struct media_request * const req) -+{ -+ return req->fd; -+} -+ -+int media_request_start(struct media_request * const req) -+{ -+ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) -+ { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err)); -+ return -err; -+ } -+ -+ pollqueue_add_task(req->pt, 2000); -+ return 0; -+} -+ -+static void media_request_done(void *v, short revents) -+{ -+ struct media_request *const req = v; -+ struct media_pool *const mp = req->mp; -+ -+ /* ** Not sure what to do about timeout */ -+ -+ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) -+ request_log("Unable to reinit media request: %s\n", -+ strerror(errno)); -+ -+ pthread_mutex_lock(&mp->lock); -+ req->next = mp->free_reqs; -+ mp->free_reqs = req; -+ pthread_mutex_unlock(&mp->lock); -+ sem_post(&mp->sem); -+} -+ -+int media_request_abort(struct media_request ** const preq) -+{ -+ struct media_request * const req = *preq; -+ -+ if (req == NULL) -+ return 0; -+ *preq = NULL; -+ -+ media_request_done(req, 0); -+ return 0; -+} -+ -+static void delete_req_chain(struct media_request * const chain) -+{ -+ struct media_request * next = chain; -+ while (next) { -+ struct media_request * const req = next; -+ next = req->next; -+ if (req->pt) -+ polltask_delete(&req->pt); -+ if (req->fd != -1) -+ close(req->fd); -+ free(req); -+ } -+} -+ -+struct media_pool * media_pool_new(const char * const media_path, -+ 
struct pollqueue * const pq, -+ const unsigned int n) -+{ -+ struct media_pool * const mp = calloc(1, sizeof(*mp)); -+ unsigned int i; -+ -+ if (!mp) -+ goto fail0; -+ -+ mp->pq = pq; -+ pthread_mutex_init(&mp->lock, NULL); -+ mp->fd = open(media_path, O_RDWR | O_NONBLOCK); -+ if (mp->fd == -1) { -+ request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); -+ goto fail1; -+ } -+ -+ for (i = 0; i != n; ++i) { -+ struct media_request * req = malloc(sizeof(*req)); -+ if (!req) -+ goto fail4; -+ -+ *req = (struct media_request){ -+ .next = mp->free_reqs, -+ .mp = mp, -+ .fd = -1 -+ }; -+ mp->free_reqs = req; -+ -+ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { -+ request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); -+ goto fail4; -+ } -+ -+ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); -+ if (!req->pt) -+ goto fail4; -+ } -+ -+ sem_init(&mp->sem, 0, n); -+ -+ return mp; -+ -+fail4: -+ delete_req_chain(mp->free_reqs); -+ close(mp->fd); -+ pthread_mutex_destroy(&mp->lock); -+fail1: -+ free(mp); -+fail0: -+ return NULL; -+} -+ -+void media_pool_delete(struct media_pool ** pMp) -+{ -+ struct media_pool * const mp = *pMp; -+ -+ if (!mp) -+ return; -+ *pMp = NULL; -+ -+ delete_req_chain(mp->free_reqs); -+ close(mp->fd); -+ sem_destroy(&mp->sem); -+ pthread_mutex_destroy(&mp->lock); -+ free(mp); -+} -+ -+ -+#define INDEX_UNSET (~(uint32_t)0) -+ -+enum qent_status { -+ QENT_NEW = 0, // Initial state - shouldn't last -+ QENT_FREE, // On free chain -+ QENT_PENDING, // User has ent -+ QENT_WAITING, // On inuse -+ QENT_DONE, // Frame rx -+ QENT_ERROR, // Error -+ QENT_IMPORT -+}; -+ -+struct qent_base { -+ atomic_int ref_count; -+ struct qent_base *next; -+ struct qent_base *prev; -+ enum qent_status status; -+ uint32_t index; -+ struct dmabuf_h *dh[VIDEO_MAX_PLANES]; -+ struct timeval timestamp; -+}; -+ -+struct qent_src { -+ struct qent_base base; -+ int fixed_size; -+}; -+ -+struct qent_dst { -+ struct 
qent_base base; -+ bool waiting; -+ pthread_mutex_t lock; -+ pthread_cond_t cond; -+ struct ff_weak_link_client * mbc_wl; -+}; -+ -+struct qe_list_head { -+ struct qent_base *head; -+ struct qent_base *tail; -+}; -+ -+struct buf_pool { -+ pthread_mutex_t lock; -+ sem_t free_sem; -+ enum v4l2_buf_type buf_type; -+ struct qe_list_head free; -+ struct qe_list_head inuse; -+}; -+ -+ -+static inline struct qent_dst *base_to_dst(struct qent_base *be) -+{ -+ return (struct qent_dst *)be; -+} -+ -+static inline struct qent_src *base_to_src(struct qent_base *be) -+{ -+ return (struct qent_src *)be; -+} -+ -+ -+#define QENT_BASE_INITIALIZER {\ -+ .ref_count = ATOMIC_VAR_INIT(0),\ -+ .status = QENT_NEW,\ -+ .index = INDEX_UNSET\ -+} -+ -+static void qe_base_uninit(struct qent_base *const be) -+{ -+ unsigned int i; -+ for (i = 0; i != VIDEO_MAX_PLANES; ++i) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+} -+ -+static void qe_src_free(struct qent_src *const be_src) -+{ -+ if (!be_src) -+ return; -+ qe_base_uninit(&be_src->base); -+ free(be_src); -+} -+ -+static struct qent_src * qe_src_new(void) -+{ -+ struct qent_src *const be_src = malloc(sizeof(*be_src)); -+ if (!be_src) -+ return NULL; -+ *be_src = (struct qent_src){ -+ .base = QENT_BASE_INITIALIZER -+ }; -+ return be_src; -+} -+ -+static void qe_dst_free(struct qent_dst *const be_dst) -+{ -+ if (!be_dst) -+ return; -+ -+ ff_weak_link_unref(&be_dst->mbc_wl); -+ pthread_cond_destroy(&be_dst->cond); -+ pthread_mutex_destroy(&be_dst->lock); -+ qe_base_uninit(&be_dst->base); -+ free(be_dst); -+} -+ -+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) -+{ -+ struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); -+ if (!be_dst) -+ return NULL; -+ *be_dst = (struct qent_dst){ -+ .base = QENT_BASE_INITIALIZER, -+ .lock = PTHREAD_MUTEX_INITIALIZER, -+ .cond = PTHREAD_COND_INITIALIZER, -+ .mbc_wl = ff_weak_link_ref(wl) -+ }; -+ return be_dst; -+} -+ -+static void ql_add_tail(struct qe_list_head * 
const ql, struct qent_base * be) -+{ -+ if (ql->tail) -+ ql->tail->next = be; -+ else -+ ql->head = be; -+ be->prev = ql->tail; -+ be->next = NULL; -+ ql->tail = be; -+} -+ -+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) -+{ -+ if (!be) -+ return NULL; -+ -+ if (be->next) -+ be->next->prev = be->prev; -+ else -+ ql->tail = be->prev; -+ if (be->prev) -+ be->prev->next = be->next; -+ else -+ ql->head = be->next; -+ be->next = NULL; -+ be->prev = NULL; -+ return be; -+} -+ -+ -+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) -+{ -+ ql_add_tail(&bp->free, be); -+} -+ -+static struct qent_base * bq_get_free(struct buf_pool *const bp) -+{ -+ return ql_extract(&bp->free, bp->free.head); -+} -+ -+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) -+{ -+ return ql_extract(&bp->inuse, be); -+} -+ -+static struct qent_base * bq_get_inuse(struct buf_pool *const bp) -+{ -+ return ql_extract(&bp->inuse, bp->inuse.head); -+} -+ -+static void bq_free_all_free_src(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_free(bp)) != NULL) -+ qe_src_free(base_to_src(be)); -+} -+ -+static void bq_free_all_inuse_src(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_inuse(bp)) != NULL) -+ qe_src_free(base_to_src(be)); -+} -+ -+static void bq_free_all_free_dst(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_free(bp)) != NULL) -+ qe_dst_free(base_to_dst(be)); -+} -+ -+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be) -+{ -+ unsigned int i; -+ -+ pthread_mutex_lock(&bp->lock); -+ /* Clear out state vars */ -+ be->timestamp.tv_sec = 0; -+ be->timestamp.tv_usec = 0; -+ be->status = QENT_FREE; -+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) -+ dmabuf_len_set(be->dh[i], 0); -+ bq_put_free(bp, be); -+ pthread_mutex_unlock(&bp->lock); -+ sem_post(&bp->free_sem); 
-+} -+ -+static bool queue_is_inuse(const struct buf_pool *const bp) -+{ -+ return bp->inuse.tail != NULL; -+} -+ -+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be) -+{ -+ if (!be) -+ return; -+ pthread_mutex_lock(&bp->lock); -+ ql_add_tail(&bp->inuse, be); -+ be->status = QENT_WAITING; -+ pthread_mutex_unlock(&bp->lock); -+} -+ -+static struct qent_base *queue_get_free(struct buf_pool *const bp) -+{ -+ struct qent_base *buf; -+ -+ if (do_wait(&bp->free_sem)) -+ return NULL; -+ pthread_mutex_lock(&bp->lock); -+ buf = bq_get_free(bp); -+ pthread_mutex_unlock(&bp->lock); -+ return buf; -+} -+ -+static struct qent_base *queue_tryget_free(struct buf_pool *const bp) -+{ -+ struct qent_base *buf; -+ -+ if (do_trywait(&bp->free_sem)) -+ return NULL; -+ pthread_mutex_lock(&bp->lock); -+ buf = bq_get_free(bp); -+ pthread_mutex_unlock(&bp->lock); -+ return buf; -+} -+ -+static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) -+{ -+ struct qent_base *be; -+ -+ pthread_mutex_lock(&bp->lock); -+ /* Expect 1st in Q, but allow anywhere */ -+ for (be = bp->inuse.head; be; be = be->next) { -+ if (dmabuf_fd(be->dh[0]) == fd) { -+ bq_extract_inuse(bp, be); -+ break; -+ } -+ } -+ pthread_mutex_unlock(&bp->lock); -+ -+ return be; -+} -+ -+static void queue_delete(struct buf_pool *const bp) -+{ -+ sem_destroy(&bp->free_sem); -+ pthread_mutex_destroy(&bp->lock); -+ free(bp); -+} -+ -+static struct buf_pool* queue_new(const int vfd) -+{ -+ struct buf_pool *bp = calloc(1, sizeof(*bp)); -+ if (!bp) -+ return NULL; -+ pthread_mutex_init(&bp->lock, NULL); -+ sem_init(&bp->free_sem, 0, 0); -+ return bp; -+} -+ -+ -+struct mediabufs_ctl { -+ atomic_int ref_count; /* 0 is single ref for easier atomics */ -+ void * dc; -+ int vfd; -+ bool stream_on; -+ bool polling; -+ bool dst_fixed; // Dst Q is fixed size -+ pthread_mutex_t lock; -+ struct buf_pool * src; -+ struct buf_pool * dst; -+ struct polltask * pt; -+ struct pollqueue * pq; -+ 
/*
 * Queue a buffer entry to the V4L2 device with VIDIOC_QBUF.
 *
 * be        - entry to queue; its index, dmabufs and timestamp are used
 * vfd       - video device fd
 * mreq      - media request to attach the buffer to (source buffers only);
 *             may be NULL
 * fmt       - format of the queue; selects single- or multi-planar layout
 * is_dst    - true for a destination buffer: dmabuf lengths and the
 *             timestamp are zeroed before queuing
 * hold_flag - for source buffers with a request, also set
 *             V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF
 *
 * Returns 0 on success or -errno from the ioctl (EINTR is retried).
 */
static int qe_v4l2_queue(struct qent_base *const be,
               const int vfd, struct media_request *const mreq,
               const struct v4l2_format *const fmt,
               const bool is_dst, const bool hold_flag)
{
    struct v4l2_buffer buffer = {
        .type = fmt->type,
        .memory = V4L2_MEMORY_DMABUF,
        .index = be->index
    };
    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};

    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
        unsigned int i;
        // One plane per dmabuf handle, stopping at the first unset handle
        for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
            if (is_dst)
                dmabuf_len_set(be->dh[i], 0);

            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
            planes[i].length = dmabuf_size(be->dh[i]);
            planes[i].bytesused = dmabuf_len(be->dh[i]);
            planes[i].m.fd = dmabuf_fd(be->dh[i]);
        }
        buffer.m.planes = planes;
        // For multi-planar buffers length is the number of planes
        buffer.length = i;
    }
    else {
        if (is_dst)
            dmabuf_len_set(be->dh[0], 0);

        buffer.bytesused = dmabuf_len(be->dh[0]);
        buffer.length = dmabuf_size(be->dh[0]);
        buffer.m.fd = dmabuf_fd(be->dh[0]);
    }

    // Only source buffers are bound to the media request
    if (!is_dst && mreq) {
        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
        buffer.request_fd = media_request_fd(mreq);
        if (hold_flag)
            buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
    }

    if (is_dst)
        be->timestamp = (struct timeval){0,0};

    buffer.timestamp = be->timestamp;

    // Retry on EINTR, fail on anything else
    while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
        const int err = errno;
        if (err != EINTR) {
            request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
            return -err;
        }
    }
    return 0;
}
.memory = V4L2_MEMORY_DMABUF -+ }; -+ if (mp) { -+ buffer.length = f->fmt.pix_mp.num_planes; -+ buffer.m.planes = planes; -+ } -+ -+ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 && -+ errno == EINTR) -+ /* Loop */; -+ if (rc) { -+ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno)); -+ return NULL; -+ } -+ -+ fd = mp ? planes[0].m.fd : buffer.m.fd; -+ be = queue_find_extract_fd(bp, fd); -+ if (!be) { -+ request_log("Failed to find fd %d in Q\n", fd); -+ return NULL; -+ } -+ -+ be->timestamp = buffer.timestamp; -+ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; -+ return be; -+} -+ -+static void qe_dst_done(struct qent_dst * dst_be) -+{ -+ pthread_mutex_lock(&dst_be->lock); -+ dst_be->waiting = false; -+ pthread_cond_broadcast(&dst_be->cond); -+ pthread_mutex_unlock(&dst_be->lock); -+ -+ qent_dst_unref(&dst_be); -+} -+ -+static bool qe_dst_waiting(struct qent_dst *const dst_be) -+{ -+ bool waiting; -+ pthread_mutex_lock(&dst_be->lock); -+ waiting = dst_be->waiting; -+ dst_be->waiting = true; -+ pthread_mutex_unlock(&dst_be->lock); -+ return waiting; -+} -+ -+ -+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc) -+{ -+ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst); -+} -+ -+static void mediabufs_poll_cb(void * v, short revents) -+{ -+ struct mediabufs_ctl *mbc = v; -+ struct qent_src *src_be = NULL; -+ struct qent_dst *dst_be = NULL; -+ -+ if (!revents) -+ request_err(mbc->dc, "%s: Timeout\n", __func__); -+ -+ pthread_mutex_lock(&mbc->lock); -+ mbc->polling = false; -+ -+ if ((revents & POLLOUT) != 0) -+ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt)); -+ if ((revents & POLLIN) != 0) -+ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt)); -+ -+ /* Reschedule */ -+ if (mediabufs_wants_poll(mbc)) { -+ mbc->polling = true; -+ pollqueue_add_task(mbc->pt, 2000); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ -+ if (src_be) -+ 
queue_put_free(mbc->src, &src_be->base); -+ if (dst_be) -+ qe_dst_done(dst_be); -+} -+ -+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp) -+{ -+ struct qent_base *const be = &be_src->base; -+ -+ be->timestamp = *timestamp; -+ return 0; -+} -+ -+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst) -+{ -+ return be_dst->base.timestamp; -+} -+ -+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { -+ size_t newsize = round_up_size(len); -+ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); -+ if (!dbsc) { -+ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); -+ return -ENOMEM; -+ } -+ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { -+ request_log("%s: Realloc %zd failed\n", __func__, newsize); -+ return -ENOMEM; -+ } -+ } -+ return 0; -+} -+ -+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ struct qent_base *const be = &be_src->base; -+ return qent_base_realloc(be, len, dbsc); -+} -+ -+ -+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ void * dst; -+ struct qent_base *const be = &be_src->base; -+ int rv; -+ -+ // Realloc doesn't copy so don't alloc if offset != 0 -+ if ((rv = qent_base_realloc(be, offset + len, -+ be_src->fixed_size || offset ? 
NULL : dbsc)) != 0) -+ return rv; -+ -+ dmabuf_write_start(be->dh[0]); -+ dst = dmabuf_map(be->dh[0]); -+ if (!dst) -+ return -1; -+ memcpy((char*)dst + offset, src, len); -+ dmabuf_len_set(be->dh[0], len); -+ dmabuf_write_end(be->dh[0]); -+ return 0; -+} -+ -+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane) -+{ -+ const struct qent_base *const be = &be_dst->base; -+ -+ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane]; -+} -+ -+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane) -+{ -+ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane))); -+} -+ -+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, -+ struct media_request **const pmreq, -+ struct qent_src **const psrc_be, -+ struct qent_dst *const dst_be, -+ const bool is_final) -+{ -+ struct media_request * mreq = *pmreq; -+ struct qent_src *const src_be = *psrc_be; -+ -+ // Req & src are always both "consumed" -+ *pmreq = NULL; -+ *psrc_be = NULL; -+ -+ pthread_mutex_lock(&mbc->lock); -+ -+ if (!src_be) -+ goto fail1; -+ -+ if (dst_be) { -+ if (qe_dst_waiting(dst_be)) { -+ request_info(mbc->dc, "Request buffer already waiting on start\n"); -+ goto fail1; -+ } -+ dst_be->base.timestamp = (struct timeval){0,0}; -+ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false)) -+ goto fail1; -+ -+ qent_dst_ref(dst_be); -+ queue_put_inuse(mbc->dst, &dst_be->base); -+ } -+ -+ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) -+ goto fail1; -+ queue_put_inuse(mbc->src, &src_be->base); -+ -+ if (!mbc->polling && mediabufs_wants_poll(mbc)) { -+ mbc->polling = true; -+ pollqueue_add_task(mbc->pt, 2000); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ -+ if (media_request_start(mreq)) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail1: -+ media_request_abort(&mreq); -+ if (src_be) -+ queue_put_free(mbc->src, 
&src_be->base); -+ -+// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q -+ if (dst_be) { -+ dst_be->base.status = QENT_ERROR; -+ qe_dst_done(dst_be); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+ -+static int qe_alloc_from_fmt(struct qent_base *const be, -+ struct dmabufs_ctl *const dbsc, -+ const struct v4l2_format *const fmt) -+{ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ unsigned int i; -+ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) { -+ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i], -+ fmt->fmt.pix_mp.plane_fmt[i].sizeimage); -+ /* On failure tidy up and die */ -+ if (!be->dh[i]) { -+ while (i--) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+ return -1; -+ } -+ } -+ } -+ else { -+// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage); -+ size_t size = fmt->fmt.pix.sizeimage; -+ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size); -+ if (!be->dh[0]) -+ return -1; -+ } -+ return 0; -+} -+ -+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd, -+ const enum v4l2_buf_type buftype, -+ uint32_t pixfmt, -+ const unsigned int width, const unsigned int height, -+ const size_t bufsize) -+{ -+ *fmt = (struct v4l2_format){.type = buftype}; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { -+ fmt->fmt.pix_mp.width = width; -+ fmt->fmt.pix_mp.height = height; -+ fmt->fmt.pix_mp.pixelformat = pixfmt; -+ if (bufsize) { -+ fmt->fmt.pix_mp.num_planes = 1; -+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize; -+ } -+ } -+ else { -+ fmt->fmt.pix.width = width; -+ fmt->fmt.pix.height = height; -+ fmt->fmt.pix.pixelformat = pixfmt; -+ fmt->fmt.pix.sizeimage = bufsize; -+ } -+ -+ while (ioctl(fd, VIDIOC_S_FMT, fmt)) -+ if (errno != EINTR) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ // Treat anything where we don't get at least what we asked for as a fail -+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { -+ if (fmt->fmt.pix_mp.width < width || -+ 
fmt->fmt.pix_mp.height < height || -+ fmt->fmt.pix_mp.pixelformat != pixfmt) { -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ } -+ else { -+ if (fmt->fmt.pix.width < width || -+ fmt->fmt.pix.height < height || -+ fmt->fmt.pix.pixelformat != pixfmt) { -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ } -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt, -+ const int fd, -+ const unsigned int type_v4l2, -+ const uint32_t flags_must, -+ const uint32_t flags_not, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v) -+{ -+ unsigned int i; -+ -+ for (i = 0;; ++i) { -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = i, -+ .type = type_v4l2 -+ }; -+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { -+ if (errno != EINTR) -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ if ((fmtdesc.flags & flags_must) != flags_must || -+ (fmtdesc.flags & flags_not)) -+ continue; -+ if (!accept_fn(accept_v, &fmtdesc)) -+ continue; -+ -+ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat, -+ width, height, 0) == MEDIABUFS_STATUS_SUCCESS) -+ return MEDIABUFS_STATUS_SUCCESS; -+ } -+ return 0; -+} -+ -+ -+/* Wait for qent done */ -+ -+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ enum qent_status estat; -+ -+ pthread_mutex_lock(&be_dst->lock); -+ while (be_dst->waiting && -+ !pthread_cond_wait(&be_dst->cond, &be_dst->lock)) -+ /* Loop */; -+ estat = be->status; -+ pthread_mutex_unlock(&be_dst->lock); -+ -+ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS : -+ estat == QENT_ERROR ? 
MEDIABUFS_ERROR_DECODING_ERROR : -+ MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no) -+{ -+ struct qent_base *const be = &be_dst->base; -+ return dmabuf_map(be->dh[buf_no]); -+} -+ -+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ unsigned int i; -+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { -+ if (dmabuf_read_start(be->dh[i])) { -+ while (i--) -+ dmabuf_read_end(be->dh[i]); -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ } -+ } -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ unsigned int i; -+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; -+ -+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { -+ if (dmabuf_read_end(be->dh[i])) -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ return status; -+} -+ -+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst) -+{ -+ if (be_dst) -+ atomic_fetch_add(&be_dst->base.ref_count, 1); -+ return be_dst; -+} -+ -+void qent_dst_unref(struct qent_dst ** const pbe_dst) -+{ -+ struct qent_dst * const be_dst = *pbe_dst; -+ struct mediabufs_ctl * mbc; -+ if (!be_dst) -+ return; -+ *pbe_dst = NULL; -+ -+ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) -+ return; -+ -+ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) { -+ queue_put_free(mbc->dst, &be_dst->base); -+ ff_weak_link_unlock(be_dst->mbc_wl); -+ } -+ else { -+ qe_dst_free(be_dst); -+ } -+} -+ -+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, -+ unsigned int plane, -+ int fd, size_t size) -+{ -+ struct qent_base *const be = &be_dst->base; -+ struct dmabuf_h * dh; -+ -+ if (be->status != QENT_IMPORT || be->dh[plane]) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ dh = dmabuf_import(fd, size); -+ if (!dh) -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ 
-+ be->dh[plane] = dh; -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+// Returns noof buffers created, -ve for error -+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[]) -+{ -+ unsigned int i; -+ -+ struct v4l2_create_buffers cbuf = { -+ .count = n, -+ .memory = V4L2_MEMORY_DMABUF, -+ .format = mbc->dst_fmt, -+ }; -+ -+ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) { -+ const int err = -errno; -+ if (err != EINTR) { -+ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__); -+ return -err; -+ } -+ } -+ -+ if (cbuf.count != n) -+ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); -+ -+ for (i = 0; i != cbuf.count; ++i) -+ qes[i]->base.index = cbuf.index + i; -+ -+ return cbuf.count; -+} -+ -+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) -+{ -+ struct qent_dst * be_dst; -+ -+ if (mbc == NULL) { -+ be_dst = qe_dst_new(NULL); -+ if (be_dst) -+ be_dst->base.status = QENT_IMPORT; -+ return be_dst; -+ } -+ -+ if (mbc->dst_fixed) { -+ be_dst = base_to_dst(queue_get_free(mbc->dst)); -+ if (!be_dst) -+ return NULL; -+ } -+ else { -+ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); -+ if (!be_dst) { -+ be_dst = qe_dst_new(mbc->this_wlm); -+ if (!be_dst) -+ return NULL; -+ -+ if (create_dst_bufs(mbc, 1, &be_dst) != 1) { -+ qe_dst_free(be_dst); -+ return NULL; -+ } -+ } -+ } -+ -+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { -+ /* Given how create buf works we can't uncreate it on alloc failure -+ * all we can do is put it on the free Q -+ */ -+ queue_put_free(mbc->dst, &be_dst->base); -+ return NULL; -+ } -+ -+ be_dst->base.status = QENT_PENDING; -+ atomic_store(&be_dst->base.ref_count, 0); -+ return be_dst; -+} -+ -+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc) -+{ -+ return &mbc->dst_fmt; -+} -+ -+MediaBufsStatus mediabufs_dst_fmt_set(struct 
mediabufs_ctl *const mbc, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v) -+{ -+ MediaBufsStatus status; -+ unsigned int i; -+ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; -+ static const struct { -+ unsigned int flags_must; -+ unsigned int flags_not; -+ } trys[] = { -+ {0, V4L2_FMT_FLAG_EMULATED}, -+ {V4L2_FMT_FLAG_EMULATED, 0}, -+ }; -+ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { -+ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, -+ buf_type, -+ trys[i].flags_must, -+ trys[i].flags_not, -+ width, height, accept_fn, accept_v); -+ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) -+ return status; -+ } -+ -+ if (status != MEDIABUFS_STATUS_SUCCESS) -+ return status; -+ -+ /* Try to create a buffer - don't alloc */ -+ return status; -+} -+ -+// ** This is a mess if we get partial alloc but without any way to remove -+// individual V4L2 Q members we are somewhat stuffed -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) -+{ -+ unsigned int i; -+ int a = 0; -+ unsigned int qc; -+ struct qent_dst * qes[32]; -+ -+ if (n > 32) -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ -+ // Create qents first as it is hard to get rid of the V4L2 buffers on error -+ for (qc = 0; qc != n; ++qc) -+ { -+ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) -+ goto fail; -+ } -+ -+ if ((a = create_dst_bufs(mbc, n, qes)) < 0) -+ goto fail; -+ -+ for (i = 0; i != a; ++i) -+ queue_put_free(mbc->dst, &qes[i]->base); -+ -+ if (a != n) -+ goto fail; -+ -+ mbc->dst_fixed = fixed; -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail: -+ for (i = (a < 0 ? 
0 : a); i != qc; ++i) -+ qe_dst_free(qes[i]); -+ -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+} -+ -+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) -+{ -+ struct qent_base * buf = queue_get_free(mbc->src); -+ buf->status = QENT_PENDING; -+ return base_to_src(buf); -+} -+ -+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src) -+{ -+ struct qent_src *const qe_src = *pqe_src; -+ if (!qe_src) -+ return; -+ *pqe_src = NULL; -+ queue_put_free(mbc->src, &qe_src->base); -+} -+ -+/* src format must have been set up before this */ -+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, -+ struct dmabufs_ctl * const dbsc, -+ unsigned int n) -+{ -+ unsigned int i; -+ struct v4l2_requestbuffers req = { -+ .count = n, -+ .type = mbc->src_fmt.type, -+ .memory = V4L2_MEMORY_DMABUF -+ }; -+ -+ bq_free_all_free_src(mbc->src); -+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { -+ if (errno != EINTR) { -+ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ -+ if (n > req.count) { -+ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n); -+ n = req.count; -+ } -+ -+ for (i = 0; i != n; ++i) { -+ struct qent_src *const be_src = qe_src_new(); -+ if (!be_src) { -+ request_err(mbc->dc, "Failed to create src be %d\n", i); -+ goto fail; -+ } -+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { -+ qe_src_free(be_src); -+ goto fail; -+ } -+ be_src->base.index = i; -+ be_src->fixed_size = !mediabufs_src_resizable(mbc); -+ -+ queue_put_free(mbc->src, &be_src->base); -+ } -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail: -+ bq_free_all_free_src(mbc->src); -+ req.count = 0; -+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+ -+ -+/* -+ * Set stuff order: -+ * Set src fmt -+ * Set parameters 
(sps) on vfd -+ * Negotiate dst format (dst_fmt_set) -+ * Create src buffers -+ * Alloc a dst buffer or Create dst slots -+*/ -+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc) -+{ -+ if (mbc->stream_on) -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { -+ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { -+ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); -+ set_stream(mbc->vfd, mbc->src_fmt.type, false); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ mbc->stream_on = true; -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) -+{ -+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; -+ -+ if (!mbc->stream_on) -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { -+ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { -+ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ mbc->stream_on = false; -+ return status; -+} -+ -+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) -+{ -+ struct v4l2_ext_controls controls = { -+ .controls = control_array, -+ .count = n -+ }; -+ -+ if (mreq) { -+ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL; -+ controls.request_fd = media_request_fd(mreq); -+ } -+ -+ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) -+ { -+ const int err = errno; -+ if (err != EINTR) { -+ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); -+ return -err; -+ } -+ } -+ -+ return 0; -+} -+ 
-+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, -+ struct media_request * const mreq, -+ unsigned int id, void *data, -+ unsigned int size) -+{ -+ struct v4l2_ext_control control = { -+ .id = id, -+ .ptr = data, -+ .size = size -+ }; -+ -+ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); -+ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, -+ enum v4l2_buf_type buf_type, -+ const uint32_t pixfmt, -+ const uint32_t width, const uint32_t height, -+ const size_t bufsize) -+{ -+ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); -+ if (rv != MEDIABUFS_STATUS_SUCCESS) -+ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); -+ -+ return rv; -+} -+ -+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) -+{ -+ int rv = 0; -+ while (n--) { -+ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { -+ const int err = errno; -+ if (err != EINTR) { -+ // Often used for probing - errors are to be expected -+ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); -+ ctrls->type = 0; // 0 is invalid -+ rv = -err; -+ break; -+ } -+ } -+ ++ctrls; -+ } -+ return rv; -+} -+ -+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) -+{ -+ // Single planar OUTPUT can only take exact size buffers -+ // Multiplanar will take larger than negotiated -+ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); -+} -+ -+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) -+{ -+ if (!mbc) -+ return; -+ -+ // Break the weak link first -+ ff_weak_link_break(&mbc->this_wlm); -+ -+ polltask_delete(&mbc->pt); -+ -+ mediabufs_stream_off(mbc); -+ -+ // Empty v4l2 buffer stash -+ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); -+ 
request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); -+ -+ bq_free_all_free_src(mbc->src); -+ bq_free_all_inuse_src(mbc->src); -+ bq_free_all_free_dst(mbc->dst); -+ -+ { -+ struct qent_dst *dst_be; -+ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { -+ dst_be->base.timestamp = (struct timeval){0}; -+ dst_be->base.status = QENT_ERROR; -+ qe_dst_done(dst_be); -+ } -+ } -+ -+ queue_delete(mbc->dst); -+ queue_delete(mbc->src); -+ close(mbc->vfd); -+ pthread_mutex_destroy(&mbc->lock); -+ -+ free(mbc); -+} -+ -+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) -+{ -+ atomic_fetch_add(&mbc->ref_count, 1); -+ return mbc; -+} -+ -+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) -+{ -+ struct mediabufs_ctl *const mbc = *pmbc; -+ int n; -+ -+ if (!mbc) -+ return; -+ *pmbc = NULL; -+ n = atomic_fetch_sub(&mbc->ref_count, 1); -+ if (n) -+ return; -+ mediabufs_ctl_delete(mbc); -+} -+ -+static int set_capabilities(struct mediabufs_ctl *const mbc) -+{ -+ struct v4l2_capability capability = { 0 }; -+ uint32_t caps; -+ -+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { -+ int err = errno; -+ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); -+ return -err; -+ } -+ -+ caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
-+ capability.device_caps : -+ capability.capabilities; -+ -+ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { -+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ } -+ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { -+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ } -+ else { -+ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+/* One of these per context */ -+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) -+{ -+ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); -+ -+ if (!mbc) -+ return NULL; -+ -+ mbc->dc = dc; -+ // Default mono planar -+ mbc->pq = pq; -+ pthread_mutex_init(&mbc->lock, NULL); -+ -+ /* Pick a default - could we scan for this? */ -+ if (vpath == NULL) -+ vpath = "/dev/media0"; -+ -+ while ((mbc->vfd = open(vpath, O_RDWR)) == -1) -+ { -+ const int err = errno; -+ if (err != EINTR) { -+ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); -+ goto fail0; -+ } -+ } -+ -+ if (set_capabilities(mbc)) { -+ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); -+ goto fail1; -+ } -+ -+ mbc->src = queue_new(mbc->vfd); -+ if (!mbc->src) -+ goto fail1; -+ mbc->dst = queue_new(mbc->vfd); -+ if (!mbc->dst) -+ goto fail2; -+ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); -+ if (!mbc->pt) -+ goto fail3; -+ mbc->this_wlm = ff_weak_link_new(mbc); -+ if (!mbc->this_wlm) -+ goto fail4; -+ -+ /* Cannot add polltask now - polling with nothing pending -+ * generates infinite error polls -+ */ -+ return mbc; -+ -+fail4: -+ polltask_delete(&mbc->pt); -+fail3: -+ queue_delete(mbc->dst); -+fail2: -+ queue_delete(mbc->src); -+fail1: -+ close(mbc->vfd); -+fail0: -+ free(mbc); -+ request_info(dc, "%s: FAILED\n", __func__); -+ return NULL; -+} -+ -+ -+ -diff --git 
a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h -new file mode 100644 -index 0000000000..2f826cfb14 ---- /dev/null -+++ b/libavcodec/v4l2_req_media.h -@@ -0,0 +1,151 @@ -+/* -+e.h -+* -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-+ */ -+ -+#ifndef _MEDIA_H_ -+#define _MEDIA_H_ -+ -+#include -+#include -+ -+struct v4l2_format; -+struct v4l2_fmtdesc; -+struct v4l2_query_ext_ctrl; -+ -+struct pollqueue; -+struct media_request; -+struct media_pool; -+ -+typedef enum media_buf_status { -+ MEDIABUFS_STATUS_SUCCESS = 0, -+ MEDIABUFS_ERROR_OPERATION_FAILED, -+ MEDIABUFS_ERROR_DECODING_ERROR, -+ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, -+ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, -+ MEDIABUFS_ERROR_ALLOCATION_FAILED, -+} MediaBufsStatus; -+ -+struct media_pool * media_pool_new(const char * const media_path, -+ struct pollqueue * const pq, -+ const unsigned int n); -+void media_pool_delete(struct media_pool ** pmp); -+ -+// Obtain a media request -+// Will block if none availible - has a 2sec timeout -+struct media_request * media_request_get(struct media_pool * const mp); -+int media_request_fd(const struct media_request * const req); -+ -+// Start this request -+// Request structure is returned to pool once done -+int media_request_start(struct media_request * const req); -+ -+// Return an *unstarted* media_request to the pool -+// May later be upgraded to allow for aborting a started req -+int media_request_abort(struct media_request ** const preq); -+ -+ -+struct mediabufs_ctl; -+struct qent_src; -+struct qent_dst; -+struct dmabuf_h; -+struct dmabufs_ctl; -+ -+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); -+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); -+ -+// prealloc -+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc); -+// dbsc may be NULL if realloc not required -+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc); -+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane); -+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane); 
-+MediaBufsStatus qent_dst_wait(struct qent_dst *const be); -+void qent_dst_delete(struct qent_dst *const be); -+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead -+void qent_dst_unref(struct qent_dst ** const pbe_dst); -+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst); -+ -+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no); -+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be); -+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be); -+/* Import an fd unattached to any mediabuf */ -+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, -+ unsigned int plane, -+ int fd, size_t size); -+ -+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, -+ struct media_request **const pmreq, -+ struct qent_src **const psrc_be, -+ struct qent_dst *const dst_be, -+ const bool is_final); -+// Get / alloc a dst buffer & associate with a slot -+// If the dst pool is empty then behaviour depends on the fixed flag passed to -+// dst_slots_create. 
Default is !fixed = unlimited alloc -+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, -+ struct dmabufs_ctl *const dbsc); -+// Create dst slots without alloc -+// If fixed true then qent_alloc will only get slots from this pool and will -+// block until a qent has been unrefed -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); -+ -+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); -+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); -+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); -+ -+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); -+ -+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v); -+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); -+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); -+ -+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, -+ struct v4l2_ext_control control_array[], unsigned int n); -+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, -+ struct media_request * const mreq, -+ unsigned int id, void *data, -+ unsigned int size); -+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); -+ -+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); -+ -+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, -+ enum v4l2_buf_type buf_type, -+ const uint32_t pixfmt, -+ const uint32_t width, const uint32_t height, -+ const size_t bufsize); -+ -+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, -+ struct dmabufs_ctl * const dbsc, -+ unsigned int n); -+ 
-+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, -+ const char *vpath, struct pollqueue *const pq); -+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); -+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc); -+ -+ -+#endif -diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c -new file mode 100644 -index 0000000000..cc8a5d4001 ---- /dev/null -+++ b/libavcodec/v4l2_req_pollqueue.c -@@ -0,0 +1,361 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_utils.h" -+ -+ -+struct pollqueue; -+ -+enum polltask_state { -+ POLLTASK_UNQUEUED = 0, -+ POLLTASK_QUEUED, -+ POLLTASK_RUNNING, -+ POLLTASK_Q_KILL, -+ POLLTASK_RUN_KILL, -+}; -+ -+struct polltask { -+ struct polltask *next; -+ struct polltask *prev; -+ struct pollqueue *q; -+ enum polltask_state state; -+ -+ int fd; -+ short events; -+ -+ void (*fn)(void *v, short revents); -+ void * v; -+ -+ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */ -+ sem_t kill_sem; -+}; -+ -+struct pollqueue { -+ atomic_int ref_count; -+ pthread_mutex_t lock; -+ -+ struct polltask *head; -+ struct polltask *tail; -+ -+ bool kill; -+ bool no_prod; -+ int prod_fd; -+ struct polltask *prod_pt; -+ pthread_t worker; -+}; -+ -+struct polltask *polltask_new(struct pollqueue *const pq, -+ const int fd, const short events, -+ void (*const fn)(void *v, short revents), -+ void *const v) -+{ -+ struct polltask *pt; -+ -+ if (!events) -+ return NULL; -+ -+ pt = malloc(sizeof(*pt)); -+ if (!pt) -+ return NULL; -+ -+ *pt = (struct polltask){ -+ .next = NULL, -+ .prev = NULL, -+ .q = pollqueue_ref(pq), -+ .fd = fd, -+ .events = events, -+ .fn = fn, -+ .v = v -+ }; -+ -+ sem_init(&pt->kill_sem, 0, 0); -+ -+ return pt; -+} -+ -+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) -+{ -+ if (pt->prev) -+ 
pt->prev->next = pt->next; -+ else -+ pq->head = pt->next; -+ if (pt->next) -+ pt->next->prev = pt->prev; -+ else -+ pq->tail = pt->prev; -+ pt->next = NULL; -+ pt->prev = NULL; -+} -+ -+static void polltask_free(struct polltask * const pt) -+{ -+ sem_destroy(&pt->kill_sem); -+ free(pt); -+} -+ -+static int pollqueue_prod(const struct pollqueue *const pq) -+{ -+ static const uint64_t one = 1; -+ return write(pq->prod_fd, &one, sizeof(one)); -+} -+ -+void polltask_delete(struct polltask **const ppt) -+{ -+ struct polltask *const pt = *ppt; -+ struct pollqueue * pq; -+ enum polltask_state state; -+ bool prodme; -+ -+ if (!pt) -+ return; -+ -+ pq = pt->q; -+ pthread_mutex_lock(&pq->lock); -+ state = pt->state; -+ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL; -+ prodme = !pq->no_prod; -+ pthread_mutex_unlock(&pq->lock); -+ -+ if (state != POLLTASK_UNQUEUED) { -+ if (prodme) -+ pollqueue_prod(pq); -+ while (sem_wait(&pt->kill_sem) && errno == EINTR) -+ /* loop */; -+ } -+ -+ // Leave zapping the ref until we have DQed the PT as might well be -+ // legitimately used in it -+ *ppt = NULL; -+ polltask_free(pt); -+ pollqueue_unref(&pq); -+} -+ -+static uint64_t pollqueue_now(int timeout) -+{ -+ struct timespec now; -+ uint64_t now_ms; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &now)) -+ return 0; -+ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; -+ return now_ms ? now_ms : (uint64_t)1; -+} -+ -+void pollqueue_add_task(struct polltask *const pt, const int timeout) -+{ -+ bool prodme = false; -+ struct pollqueue * const pq = pt->q; -+ -+ pthread_mutex_lock(&pq->lock); -+ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { -+ if (pq->tail) -+ pq->tail->next = pt; -+ else -+ pq->head = pt; -+ pt->prev = pq->tail; -+ pt->next = NULL; -+ pt->state = POLLTASK_QUEUED; -+ pt->timeout = timeout < 0 ? 
0 : pollqueue_now(timeout); -+ pq->tail = pt; -+ prodme = !pq->no_prod; -+ } -+ pthread_mutex_unlock(&pq->lock); -+ if (prodme) -+ pollqueue_prod(pq); -+} -+ -+static void *poll_thread(void *v) -+{ -+ struct pollqueue *const pq = v; -+ struct pollfd *a = NULL; -+ size_t asize = 0; -+ -+ pthread_mutex_lock(&pq->lock); -+ do { -+ unsigned int i; -+ unsigned int n = 0; -+ struct polltask *pt; -+ struct polltask *pt_next; -+ uint64_t now = pollqueue_now(0); -+ int timeout = -1; -+ int rv; -+ -+ for (pt = pq->head; pt; pt = pt_next) { -+ int64_t t; -+ -+ pt_next = pt->next; -+ -+ if (pt->state == POLLTASK_Q_KILL) { -+ pollqueue_rem_task(pq, pt); -+ sem_post(&pt->kill_sem); -+ continue; -+ } -+ -+ if (n >= asize) { -+ asize = asize ? asize * 2 : 4; -+ a = realloc(a, asize * sizeof(*a)); -+ if (!a) { -+ request_log("Failed to realloc poll array to %zd\n", asize); -+ goto fail_locked; -+ } -+ } -+ -+ a[n++] = (struct pollfd){ -+ .fd = pt->fd, -+ .events = pt->events -+ }; -+ -+ t = (int64_t)(pt->timeout - now); -+ if (pt->timeout && t < INT_MAX && -+ (timeout < 0 || (int)t < timeout)) -+ timeout = (t < 0) ? 0 : (int)t; -+ } -+ pthread_mutex_unlock(&pq->lock); -+ -+ if ((rv = poll(a, n, timeout)) == -1) { -+ if (errno != EINTR) { -+ request_log("Poll error: %s\n", strerror(errno)); -+ goto fail_unlocked; -+ } -+ } -+ -+ pthread_mutex_lock(&pq->lock); -+ now = pollqueue_now(0); -+ -+ /* Prodding in this loop is pointless and might lead to -+ * infinite looping -+ */ -+ pq->no_prod = true; -+ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { -+ pt_next = pt->next; -+ -+ /* Pending? 
*/ -+ if (a[i].revents || -+ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) { -+ pollqueue_rem_task(pq, pt); -+ if (pt->state == POLLTASK_QUEUED) -+ pt->state = POLLTASK_RUNNING; -+ if (pt->state == POLLTASK_Q_KILL) -+ pt->state = POLLTASK_RUN_KILL; -+ pthread_mutex_unlock(&pq->lock); -+ -+ /* This can add new entries to the Q but as -+ * those are added to the tail our existing -+ * chain remains intact -+ */ -+ pt->fn(pt->v, a[i].revents); -+ -+ pthread_mutex_lock(&pq->lock); -+ if (pt->state == POLLTASK_RUNNING) -+ pt->state = POLLTASK_UNQUEUED; -+ if (pt->state == POLLTASK_RUN_KILL) -+ sem_post(&pt->kill_sem); -+ } -+ } -+ pq->no_prod = false; -+ -+ } while (!pq->kill); -+ -+fail_locked: -+ pthread_mutex_unlock(&pq->lock); -+fail_unlocked: -+ free(a); -+ return NULL; -+} -+ -+static void prod_fn(void *v, short revents) -+{ -+ struct pollqueue *const pq = v; -+ char buf[8]; -+ if (revents) -+ read(pq->prod_fd, buf, 8); -+ if (!pq->kill) -+ pollqueue_add_task(pq->prod_pt, -1); -+} -+ -+struct pollqueue * pollqueue_new(void) -+{ -+ struct pollqueue *pq = malloc(sizeof(*pq)); -+ if (!pq) -+ return NULL; -+ *pq = (struct pollqueue){ -+ .ref_count = ATOMIC_VAR_INIT(0), -+ .lock = PTHREAD_MUTEX_INITIALIZER, -+ .head = NULL, -+ .tail = NULL, -+ .kill = false, -+ .prod_fd = -1 -+ }; -+ -+ pq->prod_fd = eventfd(0, EFD_NONBLOCK); -+ if (pq->prod_fd == 1) -+ goto fail1; -+ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); -+ if (!pq->prod_pt) -+ goto fail2; -+ pollqueue_add_task(pq->prod_pt, -1); -+ if (pthread_create(&pq->worker, NULL, poll_thread, pq)) -+ goto fail3; -+ // Reset ref count which will have been inced by the add_task -+ atomic_store(&pq->ref_count, 0); -+ return pq; -+ -+fail3: -+ polltask_free(pq->prod_pt); -+fail2: -+ close(pq->prod_fd); -+fail1: -+ free(pq); -+ return NULL; -+} -+ -+static void pollqueue_free(struct pollqueue *const pq) -+{ -+ void *rv; -+ -+ pthread_mutex_lock(&pq->lock); -+ pq->kill = true; -+ pollqueue_prod(pq); 
-+ pthread_mutex_unlock(&pq->lock); -+ -+ pthread_join(pq->worker, &rv); -+ polltask_free(pq->prod_pt); -+ pthread_mutex_destroy(&pq->lock); -+ close(pq->prod_fd); -+ free(pq); -+} -+ -+struct pollqueue * pollqueue_ref(struct pollqueue *const pq) -+{ -+ atomic_fetch_add(&pq->ref_count, 1); -+ return pq; -+} -+ -+void pollqueue_unref(struct pollqueue **const ppq) -+{ -+ struct pollqueue * const pq = *ppq; -+ -+ if (!pq) -+ return; -+ *ppq = NULL; -+ -+ if (atomic_fetch_sub(&pq->ref_count, 1) != 0) -+ return; -+ -+ pollqueue_free(pq); -+} -+ -+ -+ -diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h -new file mode 100644 -index 0000000000..e1182cb2fc ---- /dev/null -+++ b/libavcodec/v4l2_req_pollqueue.h -@@ -0,0 +1,18 @@ -+#ifndef POLLQUEUE_H_ -+#define POLLQUEUE_H_ -+ -+struct polltask; -+struct pollqueue; -+ -+struct polltask *polltask_new(struct pollqueue *const pq, -+ const int fd, const short events, -+ void (*const fn)(void *v, short revents), -+ void *const v); -+void polltask_delete(struct polltask **const ppt); -+ -+void pollqueue_add_task(struct polltask *const pt, const int timeout); -+struct pollqueue * pollqueue_new(void); -+void pollqueue_unref(struct pollqueue **const ppq); -+struct pollqueue * pollqueue_ref(struct pollqueue *const pq); -+ -+#endif /* POLLQUEUE_H_ */ -diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h -new file mode 100644 -index 0000000000..a31cc1f4ec ---- /dev/null -+++ b/libavcodec/v4l2_req_utils.h -@@ -0,0 +1,27 @@ -+#ifndef AVCODEC_V4L2_REQ_UTILS_H -+#define AVCODEC_V4L2_REQ_UTILS_H -+ -+#include -+#include "libavutil/log.h" -+ -+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) -+ -+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) -+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) -+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) -+#define request_debug(_ctx, ...) 
av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) -+ -+static inline char safechar(char c) { -+ return c > 0x20 && c < 0x7f ? c : '.'; -+} -+ -+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { -+ tbuf[0] = safechar((fcc >> 0) & 0xff); -+ tbuf[1] = safechar((fcc >> 8) & 0xff); -+ tbuf[2] = safechar((fcc >> 16) & 0xff); -+ tbuf[3] = safechar((fcc >> 24) & 0xff); -+ tbuf[4] = '\0'; -+ return tbuf; -+} -+ -+#endif -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -new file mode 100644 -index 0000000000..b0a5930844 ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,297 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+ -+#include "decode.h" -+#include "hevcdec.h" -+#include "hwconfig.h" -+#include "internal.h" -+ -+#include "v4l2_request_hevc.h" -+ -+#include "libavutil/hwcontext_drm.h" -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_media.h" -+#include "v4l2_req_utils.h" -+ -+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) -+{ -+ const size_t wxh = w * h; -+ size_t bits_alloc; -+ -+ /* Annex A gives a min compression of 2 @ lvl 3.1 -+ * (wxh <= 983040) and min 4 thereafter but avoid -+ * the odity of 983041 having a lower limit than -+ * 983040. -+ * Multiply by 3/2 for 4:2:0 -+ */ -+ bits_alloc = wxh < 983040 ? wxh * 3 / 4 : -+ wxh < 983040 * 2 ? 983040 * 3 / 4 : -+ wxh * 3 / 8; -+ /* Allow for bit depth */ -+ bits_alloc += (bits_alloc * bits_minus8) / 8; -+ /* Add a few bytes (16k) for overhead */ -+ bits_alloc += 0x4000; -+ return bits_alloc; -+} -+ -+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->start_frame(avctx, buffer, size); -+} -+ -+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->decode_slice(avctx, buffer, size); -+} -+ -+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->end_frame(avctx); -+} -+ -+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) -+{ -+ V4L2RequestContextHEVC * 
const ctx = avctx->internal->hwaccel_priv_data; -+ ctx->fns->abort_frame(avctx); -+} -+ -+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->frame_params(avctx, hw_frames_ctx); -+} -+ -+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->alloc_frame(avctx, frame); -+} -+ -+ -+static int v4l2_request_hevc_uninit(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode -+ -+ mediabufs_ctl_unref(&ctx->mbufs); -+ media_pool_delete(&ctx->mpool); -+ pollqueue_unref(&ctx->pq); -+ dmabufs_ctl_delete(&ctx->dbufs); -+ devscan_delete(&ctx->devscan); -+ -+ decode_q_uninit(&ctx->decode_q); -+ -+// if (avctx->hw_frames_ctx) { -+// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; -+// av_buffer_pool_flush(hwfc->pool); -+// } -+ return 0; -+} -+ -+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) -+{ -+ AVCodecContext *const avctx = v; -+ const HEVCContext *const h = avctx->priv_data; -+ -+ if (h->ps.sps->bit_depth == 8) { -+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || -+ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { -+ return 1; -+ } -+ } -+ else if (h->ps.sps->bit_depth == 10) { -+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+static int v4l2_request_hevc_init(AVCodecContext *avctx) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ const HEVCSPS * const sps = h->ps.sps; -+ int ret; -+ const struct decdev * decdev; -+ const uint32_t 
src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes -+ size_t src_size; -+ -+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); -+ return (AVERROR(-ret)); -+ } -+ ret = AVERROR(ENOMEM); // Assume mem fail by default for these -+ -+ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) -+ { -+ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); -+ ret = AVERROR(ENODEV); -+ goto fail0; -+ } -+ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n", -+ decdev_media_path(decdev), decdev_video_path(decdev)); -+ -+ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); -+ goto fail0; -+ } -+ -+ if ((ctx->pq = pollqueue_new()) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); -+ goto fail1; -+ } -+ -+ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); -+ goto fail2; -+ } -+ -+ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); -+ goto fail3; -+ } -+ -+ // Ask for an initial bitbuf size of max size / 4 -+ // We will realloc if we need more -+ // Must use sps->h/w as avctx contains cropped size -+ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); -+ if (mediabufs_src_resizable(ctx->mbufs)) -+ src_size /= 4; -+ // Kludge for conformance tests which break Annex A limits -+ else if (src_size < 0x40000) -+ src_size = 0x40000; -+ -+ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, -+ sps->width, sps->height, src_size)) { -+ char tbuf1[5]; -+ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", 
strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); -+ goto fail4; -+ } -+ -+ if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 2); -+ } -+ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 1); -+ } -+ else { -+ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); -+ ret = AVERROR(EINVAL); -+ goto fail4; -+ } -+ -+ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { -+ char tbuf1[5]; -+ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); -+ goto fail4; -+ } -+ -+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); -+ goto fail4; -+ } -+ -+ { -+ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + -+ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); -+ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, -+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, -+ avctx->thread_count, avctx->extra_hw_frames); -+ -+ // extra_hw_frames is -1 if unset -+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); -+ goto fail4; -+ } -+ } -+ -+ if (mediabufs_stream_on(ctx->mbufs)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); -+ goto fail4; -+ } -+ -+ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); -+ goto fail4; -+ } -+ -+ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); -+ goto fail5; -+ } -+ -+ decode_q_init(&ctx->decode_q); -+ -+ // Set our s/w format -+ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; -+ -+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", -+ ctx->fns->name, -+ decdev_media_path(decdev), decdev_video_path(decdev)); -+ -+ return 0; -+ -+fail5: -+ av_buffer_unref(&avctx->hw_frames_ctx); -+fail4: -+ mediabufs_ctl_unref(&ctx->mbufs); -+fail3: -+ media_pool_delete(&ctx->mpool); -+fail2: -+ pollqueue_unref(&ctx->pq); -+fail1: -+ dmabufs_ctl_delete(&ctx->dbufs); -+fail0: -+ devscan_delete(&ctx->devscan); -+ return ret; -+} -+ -+const AVHWAccel ff_hevc_v4l2request_hwaccel = { -+ .name = "hevc_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .alloc_frame = v4l2_req_hevc_alloc_frame, -+ .start_frame = v4l2_req_hevc_start_frame, -+ .decode_slice = v4l2_req_hevc_decode_slice, -+ .end_frame = v4l2_req_hevc_end_frame, -+ .abort_frame = v4l2_req_hevc_abort_frame, -+ .init = v4l2_request_hevc_init, -+ .uninit = v4l2_request_hevc_uninit, -+ .priv_data_size = 
sizeof(V4L2RequestContextHEVC), -+ .frame_params = v4l2_req_hevc_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -new file mode 100644 -index 0000000000..f14f594564 ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.h -@@ -0,0 +1,102 @@ -+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H -+#define AVCODEC_V4L2_REQUEST_HEVC_H -+ -+#include -+#include -+#include "v4l2_req_decode_q.h" -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+// P030 should be defined in drm_fourcc.h and hopefully will be sometime -+// in the future but until then... -+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') -+#endif -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+#include -+#ifndef V4L2_CID_CODEC_BASE -+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE -+#endif -+ -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in drm_fourcc.h hopefully will be sometime in the future but until then... 
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ -+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY -+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 -+#endif -+ -+#define MAX_SLICES 128 -+ -+#define VCAT(name, version) name##_v##version -+#define V2(n,v) VCAT(n, v) -+#define V(n) V2(n, HEVC_CTRLS_VERSION) -+ -+#define S2(x) #x -+#define STR(x) S2(x) -+ -+// 1 per decoder -+struct v4l2_req_decode_fns; -+ -+typedef struct V4L2RequestContextHEVC { -+// V4L2RequestContext base; -+ const struct v4l2_req_decode_fns * fns; -+ -+ unsigned int timestamp; // ?? maybe uint64_t -+ -+ int multi_slice; -+ int decode_mode; -+ int start_code; -+ int max_slices; -+ -+ req_decode_q decode_q; -+ -+ struct devscan *devscan; -+ struct dmabufs_ctl *dbufs; -+ struct pollqueue *pq; -+ struct media_pool * mpool; -+ struct mediabufs_ctl *mbufs; -+} V4L2RequestContextHEVC; -+ -+typedef struct v4l2_req_decode_fns { -+ int src_pix_fmt_v4l2; -+ const char * name; -+ -+ // Init setup -+ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); -+ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); -+ -+ // Passthrough of hwaccel fns -+ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); -+ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); -+ int (*end_frame)(AVCodecContext *avctx); -+ void (*abort_frame)(AVCodecContext *avctx); -+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); -+ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); -+} v4l2_req_decode_fns; -+ -+ -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); -+ -+#endif - -From 
c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 27 Apr 2021 19:30:36 +0100 -Subject: [PATCH 013/136] Add no_cvt_hw option to ffmpeg - ---- - fftools/ffmpeg.c | 6 ++++-- - fftools/ffmpeg.h | 2 ++ - fftools/ffmpeg_opt.c | 3 +++ - 3 files changed, 9 insertions(+), 2 deletions(-) - -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 15e084f0b2..5dc2cd73c1 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -2005,6 +2005,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref - (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) - need_reinit = 1; - -+ if (no_cvt_hw && fg->graph) -+ need_reinit = 0; -+ - if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) { - if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9)) - need_reinit = 1; -@@ -2274,8 +2277,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ - decoded_frame->top_field_first = ist->top_field_first; - - ist->frames_decoded++; -- -- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { -+ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { - err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); - if (err < 0) - goto fail; -diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h -index f1412f6446..8f478619b3 100644 ---- a/fftools/ffmpeg.h -+++ b/fftools/ffmpeg.h -@@ -729,6 +729,8 @@ extern enum VideoSyncMethod video_sync_method; - extern float frame_drop_threshold; - extern int do_benchmark; - extern int do_benchmark_all; -+extern int no_cvt_hw; -+extern int do_deinterlace; - extern int do_hex_dump; - extern int do_pkt_dump; - extern int copy_ts; -diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c -index 055275d813..761db36588 100644 ---- a/fftools/ffmpeg_opt.c -+++ b/fftools/ffmpeg_opt.c -@@ -71,6 +71,7 @@ enum 
VideoSyncMethod video_sync_method = VSYNC_AUTO; - float frame_drop_threshold = 0; - int do_benchmark = 0; - int do_benchmark_all = 0; -+int no_cvt_hw = 0; - int do_hex_dump = 0; - int do_pkt_dump = 0; - int copy_ts = 0; -@@ -1427,6 +1428,8 @@ const OptionDef options[] = { - "add timings for benchmarking" }, - { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, - "add timings for each task" }, -+ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, -+ "do not auto-convert hw frames to sw" }, - { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, - "write program-readable progress information", "url" }, - { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, - -From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 28 Apr 2021 10:16:39 +0100 -Subject: [PATCH 014/136] Add vout_drm - ---- - configure | 4 + - libavdevice/Makefile | 1 + - libavdevice/alldevices.c | 1 + - libavdevice/drm_vout.c | 638 +++++++++++++++++++++++++++++++++++++++ - 4 files changed, 644 insertions(+) - create mode 100644 libavdevice/drm_vout.c - -diff --git a/configure b/configure -index 199aa2b3d5..49744cab19 100755 ---- a/configure -+++ b/configure -@@ -346,6 +346,7 @@ External library support: - --enable-libnpp enable Nvidia Performance Primitives-based code [no] - --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] - --enable-sand enable sand video formats [rpi] -+ --enable-vout-drm enable the vout_drm module - for internal testing only [no] - --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] - --disable-nvenc disable Nvidia video encoding code [autodetect] - --enable-omx enable OpenMAX IL code [no] -@@ -1940,6 +1941,7 @@ FEATURE_LIST=" - small - static - swscale_alpha -+ vout_drm - " - - # this list should be kept in linking order -@@ -3559,8 +3561,10 @@ sndio_indev_deps="sndio" - sndio_outdev_deps="sndio" - v4l2_indev_deps_any="linux_videodev2_h 
sys_videoio_h" - v4l2_indev_suggest="libv4l2" -+v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" -+vout_drm_outdev_deps="libdrm vout_drm" - vfwcap_indev_deps="vfw32 vfwcap_defines" - xcbgrab_indev_deps="libxcb" - xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" -diff --git a/libavdevice/Makefile b/libavdevice/Makefile -index 8a62822b69..36aac30186 100644 ---- a/libavdevice/Makefile -+++ b/libavdevice/Makefile -@@ -48,6 +48,7 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o - OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o - OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o - OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o -+OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o - OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o - OBJS-$(CONFIG_XV_OUTDEV) += xv.o - -diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c -index 8a90fcb5d7..e2a8669f27 100644 ---- a/libavdevice/alldevices.c -+++ b/libavdevice/alldevices.c -@@ -52,6 +52,7 @@ extern const FFOutputFormat ff_sndio_muxer; - extern const AVInputFormat ff_v4l2_demuxer; - extern const FFOutputFormat ff_v4l2_muxer; - extern const AVInputFormat ff_vfwcap_demuxer; -+extern const FFOutputFormat ff_vout_drm_muxer; - extern const AVInputFormat ff_xcbgrab_demuxer; - extern const FFOutputFormat ff_xv_muxer; - -diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -new file mode 100644 -index 0000000000..cfb33ce7c3 ---- /dev/null -+++ b/libavdevice/drm_vout.c -@@ -0,0 +1,638 @@ -+/* -+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. 
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+// *** This module is a work in progress and its utility is strictly -+// limited to testing. -+ -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavformat/mux.h" -+#include "avdevice.h" -+ -+#include "pthread.h" -+#include -+#include -+ -+#include -+#include -+ -+#define TRACE_ALL 0 -+ -+#define DRM_MODULE "vc4" -+ -+#define ERRSTR strerror(errno) -+ -+struct drm_setup { -+ int conId; -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ unsigned int out_fourcc; -+ struct { -+ int x, y, width, height; -+ } compose; -+}; -+ -+typedef struct drm_aux_s { -+ unsigned int fb_handle; -+ uint32_t bo_handles[AV_DRM_MAX_PLANES]; -+ AVFrame * frame; -+} drm_aux_t; -+ -+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS -+// we get initial flicker probably due to dodgy drm timing -+#define AUX_SIZE 3 -+typedef struct drm_display_env_s -+{ -+ AVClass *class; -+ -+ int drm_fd; -+ uint32_t con_id; -+ struct drm_setup setup; -+ enum AVPixelFormat avfmt; -+ int show_all; -+ -+ unsigned int ano; -+ drm_aux_t aux[AUX_SIZE]; -+ -+ pthread_t q_thread; -+ sem_t q_sem_in; -+ sem_t q_sem_out; -+ int q_terminate; -+ AVFrame * q_next; -+ -+} drm_display_env_t; -+ -+ -+static int drm_vout_write_trailer(AVFormatContext *s) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ -+ return 0; -+} -+ -+static int drm_vout_write_header(AVFormatContext *s) -+{ -+ const AVCodecParameters * 
const par = s->streams[0]->codecpar; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+/* Search the DRM planes for one that can be used with the CRTC at index -+ * crtcidx and lists the given fourcc format. On success stores the plane -+ * id in *pplane_id and returns 0; returns -1 if no plane matches or a -+ * libdrm query fails. -+ */ -+static int find_plane(struct AVFormatContext * const avctx, -+ const int drmfd, const int crtcidx, const uint32_t format, -+ uint32_t * const pplane_id) -+{ -+ drmModePlaneResPtr planes; -+ drmModePlanePtr plane; -+ unsigned int i; -+ unsigned int j; -+ int ret = 0; -+ -+ planes = drmModeGetPlaneResources(drmfd); -+ if (!planes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); -+ return -1; -+ } -+ -+ for (i = 0; i < planes->count_planes; ++i) { -+ plane = drmModeGetPlane(drmfd, planes->planes[i]); -+ /* Check the plane just fetched, not the resource list */ -+ if (!plane) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); -+ /* Leaving the loop early must not look like a match */ -+ ret = -1; -+ break; -+ } -+ -+ if (!(plane->possible_crtcs & (1 << crtcidx))) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ for (j = 0; j < plane->count_formats; ++j) { -+ if (plane->formats[j] == format) -+ break; -+ } -+ -+ if (j == plane->count_formats) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ *pplane_id = plane->plane_id; -+ drmModeFreePlane(plane); -+ break; -+ } -+ -+ if (i == planes->count_planes) -+ ret = -1; -+ -+ drmModeFreePlaneResources(planes); -+ return ret; -+} -+ -+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) -+{ -+ if (da->fb_handle != 0) { -+ drmModeRmFB(de->drm_fd, da->fb_handle); -+ da->fb_handle = 0; -+ } -+ -+ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) { -+ if (da->bo_handles[i]) { -+ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]}; -+ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); -+ da->bo_handles[i] = 0; -+ } -+ } -+ av_frame_free(&da->frame); -+} -+ -+static int do_display(AVFormatContext 
* const s, drm_display_env_t * const de, AVFrame * frame) -+{ -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; -+ drm_aux_t * da = de->aux + de->ano; -+ const uint32_t format = desc->layers[0].format; -+ int ret = 0; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd); -+#endif -+ -+ if (de->setup.out_fourcc != format) { -+ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) { -+ av_frame_free(&frame); -+ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format); -+ return -1; -+ } -+ de->setup.out_fourcc = format; -+ } -+ -+ { -+ drmVBlank vbl = { -+ .request = { -+ .type = DRM_VBLANK_RELATIVE, -+ .sequence = 0 -+ } -+ }; -+ -+ while (drmWaitVBlank(de->drm_fd, &vbl)) { -+ if (errno != EINTR) { -+// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); -+ break; -+ } -+ } -+ } -+ -+ da_uninit(de, da); -+ -+ { -+ uint32_t pitches[4] = {0}; -+ uint32_t offsets[4] = {0}; -+ uint64_t modifiers[4] = {0}; -+ uint32_t bo_handles[4] = {0}; -+ int i, j, n; -+ -+ da->frame = frame; -+ -+ for (i = 0; i < desc->nb_objects; ++i) { -+ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { -+ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); -+ return -1; -+ } -+ } -+ -+ n = 0; -+ for (i = 0; i < desc->nb_layers; ++i) { -+ for (j = 0; j < desc->layers[i].nb_planes; ++j) { -+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; -+ pitches[n] = p->pitch; -+ offsets[n] = p->offset; -+ modifiers[n] = obj->format_modifier; -+ bo_handles[n] = da->bo_handles[p->object_index]; -+ ++n; -+ } -+ } -+ -+#if 1 && TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," -+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", -+ av_frame_cropped_width(frame), -+ 
av_frame_cropped_height(frame), -+ desc->layers[0].format, -+ bo_handles[0], -+ bo_handles[1], -+ bo_handles[2], -+ bo_handles[3], -+ pitches[0], -+ pitches[1], -+ pitches[2], -+ pitches[3], -+ offsets[0], -+ offsets[1], -+ offsets[2], -+ offsets[3], -+ (long long)modifiers[0], -+ (long long)modifiers[1], -+ (long long)modifiers[2], -+ (long long)modifiers[3] -+ ); -+#endif -+ -+ if (drmModeAddFB2WithModifiers(de->drm_fd, -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, bo_handles, -+ pitches, offsets, modifiers, -+ &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { -+ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); -+ return -1; -+ } -+ } -+ -+ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, -+ da->fb_handle, 0, -+ de->setup.compose.x, de->setup.compose.y, -+ de->setup.compose.width, -+ de->setup.compose.height, -+ 0, 0, -+ av_frame_cropped_width(frame) << 16, -+ av_frame_cropped_height(frame) << 16); -+ -+ if (ret != 0) { -+ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); -+ } -+ -+ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1; -+ -+ return ret; -+} -+ -+static int do_sem_wait(sem_t * const sem, const int nowait) -+{ -+ while (nowait ? 
sem_trywait(sem) : sem_wait(sem)) { -+ if (errno != EINTR) -+ return -errno; -+ } -+ return 0; -+} -+ -+static void * display_thread(void * v) -+{ -+ AVFormatContext * const s = v; -+ drm_display_env_t * const de = s->priv_data; -+ int i; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+#endif -+ -+ sem_post(&de->q_sem_out); -+ -+ for (;;) { -+ AVFrame * frame; -+ -+ do_sem_wait(&de->q_sem_in, 0); -+ -+ if (de->q_terminate) -+ break; -+ -+ frame = de->q_next; -+ de->q_next = NULL; -+ sem_post(&de->q_sem_out); -+ -+ do_display(s, de, frame); -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+#endif -+ -+ for (i = 0; i != AUX_SIZE; ++i) -+ da_uninit(de, de->aux + i); -+ -+ av_frame_free(&de->q_next); -+ -+ return NULL; -+} -+ -+/* Take the wrapped AVFrame carried in pkt, obtain a DRM_PRIME frame from it -+ * (new reference, or a map from VAAPI) and hand it to the display thread via -+ * q_next. Corrupt frames are dropped; frames of any other format are rejected -+ * with AVERROR(EINVAL). -+ */ -+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ const AVFrame * const src_frame = (AVFrame *)pkt->data; -+ AVFrame * frame; -+ drm_display_env_t * const de = s->priv_data; -+ int ret; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ -+ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { -+ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts); -+ return 0; -+ } -+ -+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { -+ frame = av_frame_alloc(); -+ if (!frame) -+ return AVERROR(ENOMEM); -+ /* Do not queue a frame that failed to take its reference */ -+ if ((ret = av_frame_ref(frame, src_frame)) < 0) { -+ av_frame_free(&frame); -+ return ret; -+ } -+ } -+ else if (src_frame->format == AV_PIX_FMT_VAAPI) { -+ frame = av_frame_alloc(); -+ if (!frame) -+ return AVERROR(ENOMEM); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (av_hwframe_map(frame, src_frame, 0) != 0) -+ { -+ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); -+ av_frame_free(&frame); -+ return AVERROR(EINVAL); -+ } -+ } -+ else { -+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); -+ return AVERROR(EINVAL); -+ } -+ -+ ret = do_sem_wait(&de->q_sem_out, !de->show_all); -+ if (ret) { -+ av_frame_free(&frame); -+ } -+ else { -+ de->q_next = frame; -+ 
sem_post(&de->q_sem_in); -+ } -+ -+ return 0; -+} -+ -+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+ return AVERROR_PATCHWELCOME; -+} -+ -+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type); -+#endif -+ switch(type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) -+{ -+ int ret = -1; -+ int i; -+ drmModeRes *res = drmModeGetResources(drmfd); -+ drmModeConnector *c; -+ -+ if(!res) -+ { -+ printf( "drmModeGetResources failed: %s\n", ERRSTR); -+ return -1; -+ } -+ -+ if (res->count_crtcs <= 0) -+ { -+ printf( "drm: no crts\n"); -+ goto fail_res; -+ } -+ -+ if (!s->conId) { -+ fprintf(stderr, -+ "No connector ID specified. Choosing default from list:\n"); -+ -+ for (i = 0; i < res->count_connectors; i++) { -+ drmModeConnector *con = -+ drmModeGetConnector(drmfd, res->connectors[i]); -+ drmModeEncoder *enc = NULL; -+ drmModeCrtc *crtc = NULL; -+ -+ if (con->encoder_id) { -+ enc = drmModeGetEncoder(drmfd, con->encoder_id); -+ if (enc->crtc_id) { -+ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); -+ } -+ } -+ -+ if (!s->conId && crtc) { -+ s->conId = con->connector_id; -+ s->crtcId = crtc->crtc_id; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n", -+ con->connector_id, -+ crtc ? crtc->crtc_id : 0, -+ con->connector_type, -+ crtc ? crtc->width : 0, -+ crtc ? crtc->height : 0, -+ (s->conId == (int)con->connector_id ? 
-+ " (chosen)" : "")); -+ } -+ -+ if (!s->conId) { -+ av_log(avctx, AV_LOG_ERROR, -+ "No suitable enabled connector found.\n"); -+ return -1;; -+ } -+ } -+ -+ s->crtcIdx = -1; -+ -+ for (i = 0; i < res->count_crtcs; ++i) { -+ if (s->crtcId == res->crtcs[i]) { -+ s->crtcIdx = i; -+ break; -+ } -+ } -+ -+ if (s->crtcIdx == -1) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); -+ goto fail_res; -+ } -+ -+ if (res->count_connectors <= 0) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); -+ goto fail_res; -+ } -+ -+ c = drmModeGetConnector(drmfd, s->conId); -+ if (!c) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); -+ goto fail_res; -+ } -+ -+ if (!c->count_modes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); -+ goto fail_conn; -+ } -+ -+ { -+ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); -+ s->compose.x = crtc->x; -+ s->compose.y = crtc->y; -+ s->compose.width = crtc->width; -+ s->compose.height = crtc->height; -+ drmModeFreeCrtc(crtc); -+ } -+ -+ if (pConId) -+ *pConId = c->connector_id; -+ ret = 0; -+ -+fail_conn: -+ drmModeFreeConnector(c); -+ -+fail_res: -+ drmModeFreeResources(res); -+ -+ return ret; -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static int drm_vout_init(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ int rv; -+ const char * drm_module = DRM_MODULE; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->drm_fd = -1; -+ de->con_id = 0; -+ de->setup = (struct drm_setup){0}; -+ de->q_terminate = 0; -+ -+ if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) -+ { -+ rv = AVERROR(errno); -+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); -+ return rv; -+ } -+ -+ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) -+ { -+ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); -+ rv = AVERROR(EINVAL); -+ goto fail_close; -+ } -+ 
-+ sem_init(&de->q_sem_in, 0, 0); -+ sem_init(&de->q_sem_out, 0, 0); -+ if (pthread_create(&de->q_thread, NULL, display_thread, s)) { -+ rv = AVERROR(errno); -+ av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); -+ goto fail_close; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+ -+ return 0; -+ -+fail_close: -+ close(de->drm_fd); -+ de->drm_fd = -1; -+ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__); -+ -+ return rv; -+} -+ -+static void drm_vout_deinit(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->q_terminate = 1; -+ sem_post(&de->q_sem_in); -+ pthread_join(de->q_thread, NULL); -+ sem_destroy(&de->q_sem_in); -+ sem_destroy(&de->q_sem_out); -+ -+ for (unsigned int i = 0; i != AUX_SIZE; ++i) -+ da_uninit(de, de->aux + i); -+ -+ av_frame_free(&de->q_next); -+ -+ if (de->drm_fd >= 0) { -+ close(de->drm_fd); -+ de->drm_fd = -1; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+} -+ -+ -+#define OFFSET(x) offsetof(drm_display_env_t, x) -+static const AVOption options[] = { -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+}; -+ -+static const AVClass drm_vout_class = { -+ .class_name = "drm vid outdev", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+FFOutputFormat ff_vout_drm_muxer = { -+ .p = { -+ .name = "vout_drm", -+ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &drm_vout_class, -+ }, -+ .priv_data_size = sizeof(drm_display_env_t), -+ .write_header = drm_vout_write_header, -+ .write_packet = drm_vout_write_packet, -+ 
.write_uncoded_frame = drm_vout_write_frame, -+ .write_trailer = drm_vout_write_trailer, -+ .control_message = drm_vout_control_message, -+ .init = drm_vout_init, -+ .deinit = drm_vout_deinit, -+}; -+ - -From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 28 Apr 2021 11:34:18 +0100 -Subject: [PATCH 015/136] Add vout_egl - ---- - configure | 6 + - libavdevice/Makefile | 1 + - libavdevice/alldevices.c | 1 + - libavdevice/egl_vout.c | 811 +++++++++++++++++++++++++++++++++++++++ - 4 files changed, 819 insertions(+) - create mode 100644 libavdevice/egl_vout.c - -diff --git a/configure b/configure -index 49744cab19..b41663c794 100755 ---- a/configure -+++ b/configure -@@ -347,6 +347,7 @@ External library support: - --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] - --enable-sand enable sand video formats [rpi] - --enable-vout-drm enable the vout_drm module - for internal testing only [no] -+ --enable-vout-egl enable the vout_egl module - for internal testing only [no] - --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] - --disable-nvenc disable Nvidia video encoding code [autodetect] - --enable-omx enable OpenMAX IL code [no] -@@ -1818,6 +1819,7 @@ EXTERNAL_LIBRARY_LIST=" - libdav1d - libdc1394 - libdrm -+ epoxy - libflite - libfontconfig - libfreetype -@@ -1942,6 +1944,7 @@ FEATURE_LIST=" - static - swscale_alpha - vout_drm -+ vout_egl - " - - # this list should be kept in linking order -@@ -3565,6 +3568,8 @@ v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" - vout_drm_outdev_deps="libdrm vout_drm" -+vout_egl_outdev_deps="xlib" -+vout_egl_outdev_select="epoxy" - vfwcap_indev_deps="vfw32 vfwcap_defines" - xcbgrab_indev_deps="libxcb" - xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" -@@ -6596,6 +6601,7 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" 
"dav1d - enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open - enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new - enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion -+enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version - enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || - { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && - warn "using libfdk without pkg-config"; } } -diff --git a/libavdevice/Makefile b/libavdevice/Makefile -index 36aac30186..0989cb895f 100644 ---- a/libavdevice/Makefile -+++ b/libavdevice/Makefile -@@ -49,6 +49,7 @@ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o - OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o - OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o - OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o -+OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o - OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o - OBJS-$(CONFIG_XV_OUTDEV) += xv.o - -diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c -index e2a8669f27..ffb410b92d 100644 ---- a/libavdevice/alldevices.c -+++ b/libavdevice/alldevices.c -@@ -53,6 +53,7 @@ extern const AVInputFormat ff_v4l2_demuxer; - extern const FFOutputFormat ff_v4l2_muxer; - extern const AVInputFormat ff_vfwcap_demuxer; - extern const FFOutputFormat ff_vout_drm_muxer; -+extern const FFOutputFormat ff_vout_egl_muxer; - extern const AVInputFormat ff_xcbgrab_demuxer; - extern const FFOutputFormat ff_xv_muxer; - -diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c -new file mode 100644 -index 0000000000..7b9c610ace ---- /dev/null -+++ b/libavdevice/egl_vout.c -@@ -0,0 +1,811 @@ -+/* -+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+// *** This module is a work in progress and its utility is strictly -+// limited to testing. -+// Amongst other issues it doesn't wait for the pic to be displayed before -+// returning the buffer so flikering does occur. -+ -+#include -+#include -+ -+#include "libavutil/opt.h" -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/imgutils.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavformat/mux.h" -+#include "avdevice.h" -+ -+#include "pthread.h" -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "libavutil/rpi_sand_fns.h" -+ -+#define TRACE_ALL 0 -+ -+struct egl_setup { -+ int conId; -+ -+ Display *dpy; -+ EGLDisplay egl_dpy; -+ EGLContext ctx; -+ EGLSurface surf; -+ Window win; -+ -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ struct { -+ int x, y, width, height; -+ } compose; -+}; -+ -+typedef struct egl_aux_s { -+ int fd; -+ GLuint texture; -+ -+} egl_aux_t; -+ -+typedef struct egl_display_env_s -+{ -+ AVClass *class; -+ -+ struct egl_setup setup; -+ enum AVPixelFormat avfmt; -+ -+ int show_all; -+ int window_width, window_height; -+ int window_x, window_y; -+ int fullscreen; -+ -+ egl_aux_t aux[32]; -+ -+ pthread_t q_thread; -+ 
pthread_mutex_t q_lock; -+ sem_t display_start_sem; -+ sem_t q_sem; -+ int q_terminate; -+ AVFrame * q_this; -+ AVFrame * q_next; -+ -+} egl_display_env_t; -+ -+ -+/** -+ * Remove window border/decorations. -+ */ -+static void -+no_border( Display *dpy, Window w) -+{ -+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); -+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; -+ -+ typedef struct -+ { -+ unsigned long flags; -+ unsigned long functions; -+ unsigned long decorations; -+ long inputMode; -+ unsigned long status; -+ } PropMotifWmHints; -+ -+ PropMotifWmHints motif_hints; -+ Atom prop, proptype; -+ unsigned long flags = 0; -+ -+ /* setup the property */ -+ motif_hints.flags = MWM_HINTS_DECORATIONS; -+ motif_hints.decorations = flags; -+ -+ /* get the atom for the property */ -+ prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); -+ if (!prop) { -+ /* something went wrong! */ -+ return; -+ } -+ -+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ -+ proptype = prop; -+ -+ XChangeProperty( dpy, w, /* display, window */ -+ prop, proptype, /* property, type */ -+ 32, /* format: 32-bit datums */ -+ PropModeReplace, /* mode */ -+ (unsigned char *) &motif_hints, /* data */ -+ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ -+ ); -+} -+ -+ -+/* -+ * Create an RGB, double-buffered window. -+ * Return the window and context handles. -+ */ -+static int -+make_window(struct AVFormatContext * const s, -+ egl_display_env_t * const de, -+ Display *dpy, EGLDisplay egl_dpy, const char *name, -+ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) -+{ -+ int scrnum = DefaultScreen( dpy ); -+ XSetWindowAttributes attr; -+ unsigned long mask; -+ Window root = RootWindow( dpy, scrnum ); -+ Window win; -+ EGLContext ctx; -+ const int fullscreen = de->fullscreen; -+ EGLConfig config; -+ int x = de->window_x; -+ int y = de->window_y; -+ int width = de->window_width ? de->window_width : 1280; -+ int height = de->window_height ? 
de->window_height : 720; -+ -+ -+ if (fullscreen) { -+ int scrnum = DefaultScreen(dpy); -+ -+ x = 0; y = 0; -+ width = DisplayWidth(dpy, scrnum); -+ height = DisplayHeight(dpy, scrnum); -+ } -+ -+ { -+ EGLint num_configs; -+ static const EGLint attribs[] = { -+ EGL_RED_SIZE, 1, -+ EGL_GREEN_SIZE, 1, -+ EGL_BLUE_SIZE, 1, -+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, -+ EGL_NONE -+ }; -+ -+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { -+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); -+ return -1; -+ } -+ } -+ -+ { -+ EGLint vid; -+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); -+ return -1; -+ } -+ -+ { -+ XVisualInfo visTemplate = { -+ .visualid = vid, -+ }; -+ int num_visuals; -+ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, -+ &visTemplate, &num_visuals); -+ -+ /* window attributes */ -+ attr.background_pixel = 0; -+ attr.border_pixel = 0; -+ attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); -+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; -+ /* XXX this is a bad way to get a borderless window! 
*/ -+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; -+ -+ win = XCreateWindow( dpy, root, x, y, width, height, -+ 0, visinfo->depth, InputOutput, -+ visinfo->visual, mask, &attr ); -+ XFree(visinfo); -+ } -+ } -+ -+ if (fullscreen) -+ no_border(dpy, win); -+ -+ /* set hints and properties */ -+ { -+ XSizeHints sizehints; -+ sizehints.x = x; -+ sizehints.y = y; -+ sizehints.width = width; -+ sizehints.height = height; -+ sizehints.flags = USSize | USPosition; -+ XSetNormalHints(dpy, win, &sizehints); -+ XSetStandardProperties(dpy, win, name, name, -+ None, (char **)NULL, 0, &sizehints); -+ } -+ -+ eglBindAPI(EGL_OPENGL_ES_API); -+ -+ { -+ static const EGLint ctx_attribs[] = { -+ EGL_CONTEXT_CLIENT_VERSION, 2, -+ EGL_NONE -+ }; -+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); -+ if (!ctx) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ } -+ -+ -+ XMapWindow(dpy, win); -+ -+ { -+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); -+ if (!surf) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); -+ return -1; -+ } -+ -+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ -+ *winRet = win; -+ *ctxRet = ctx; -+ *surfRet = surf; -+ } -+ -+ return 0; -+} -+ -+static GLint -+compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) -+{ -+ GLuint s = glCreateShader(target); -+ -+ if (s == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); -+ return 0; -+ } -+ -+ glShaderSource(s, 1, (const GLchar **) &source, NULL); -+ glCompileShader(s); -+ -+ { -+ GLint ok; -+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); -+ -+ if (!ok) { -+ GLchar *info; -+ GLint size; -+ -+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); -+ info = malloc(size); -+ -+ glGetShaderInfoLog(s, size, NULL, info); -+ av_log(avctx, AV_LOG_ERROR, "Failed 
to compile shader: %ssource:\n%s\n", info, source); -+ -+ return 0; -+ } -+ } -+ -+ return s; -+} -+ -+static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) -+{ -+ GLuint prog = glCreateProgram(); -+ -+ if (prog == 0) { -+ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); -+ return 0; -+ } -+ -+ glAttachShader(prog, vs); -+ glAttachShader(prog, fs); -+ glLinkProgram(prog); -+ -+ { -+ GLint ok; -+ glGetProgramiv(prog, GL_LINK_STATUS, &ok); -+ if (!ok) { -+ /* Some drivers return a size of 1 for an empty log. This is the size -+ * of a log that contains only a terminating NUL character. -+ */ -+ GLint size; -+ GLchar *info = NULL; -+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); -+ if (size > 1) { -+ info = malloc(size); -+ glGetProgramInfoLog(prog, size, NULL, info); -+ } -+ -+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", -+ (info != NULL) ? info : ""); -+ return 0; -+ } -+ } -+ -+ return prog; -+} -+ -+static int -+gl_setup(struct AVFormatContext * const s) -+{ -+ const char *vs = -+ "attribute vec4 pos;\n" -+ "varying vec2 texcoord;\n" -+ "\n" -+ "void main() {\n" -+ " gl_Position = pos;\n" -+ " texcoord.x = (pos.x + 1.0) / 2.0;\n" -+ " texcoord.y = (-pos.y + 1.0) / 2.0;\n" -+ "}\n"; -+ const char *fs = -+ "#extension GL_OES_EGL_image_external : enable\n" -+ "precision mediump float;\n" -+ "uniform samplerExternalOES s;\n" -+ "varying vec2 texcoord;\n" -+ "void main() {\n" -+ " gl_FragColor = texture2D(s, texcoord);\n" -+ "}\n"; -+ -+ GLuint vs_s; -+ GLuint fs_s; -+ GLuint prog; -+ -+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || -+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || -+ !(prog = link_program(s, vs_s, fs_s))) -+ return -1; -+ -+ glUseProgram(prog); -+ -+ { -+ static const float verts[] = { -+ -1, -1, -+ 1, -1, -+ 1, 1, -+ -1, 1, -+ }; -+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); -+ } -+ -+ glEnableVertexAttribArray(0); -+ return 0; -+} -+ -+static int 
egl_vout_write_trailer(AVFormatContext *s) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ return 0; -+} -+ -+static int egl_vout_write_header(AVFormatContext *s) -+{ -+ const AVCodecParameters * const par = s->streams[0]->codecpar; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+ -+static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) -+{ -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; -+ egl_aux_t * da = NULL; -+ unsigned int i; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); -+#endif -+ -+ for (i = 0; i != 32; ++i) { -+ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { -+ da = de->aux + i; -+ break; -+ } -+ } -+ -+ if (da == NULL) { -+ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ if (da->texture == 0) { -+ EGLint attribs[50]; -+ EGLint * a = attribs; -+ int i, j; -+ static const EGLint anames[] = { -+ EGL_DMA_BUF_PLANE0_FD_EXT, -+ EGL_DMA_BUF_PLANE0_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE0_PITCH_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE1_FD_EXT, -+ EGL_DMA_BUF_PLANE1_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE1_PITCH_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE2_FD_EXT, -+ EGL_DMA_BUF_PLANE2_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE2_PITCH_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, -+ }; -+ const EGLint * b = anames; -+ -+ *a++ = EGL_WIDTH; -+ *a++ = av_frame_cropped_width(frame); -+ *a++ = EGL_HEIGHT; -+ *a++ = av_frame_cropped_height(frame); -+ *a++ = 
EGL_LINUX_DRM_FOURCC_EXT; -+ *a++ = desc->layers[0].format; -+ -+ for (i = 0; i < desc->nb_layers; ++i) { -+ for (j = 0; j < desc->layers[i].nb_planes; ++j) { -+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; -+ *a++ = *b++; -+ *a++ = obj->fd; -+ *a++ = *b++; -+ *a++ = p->offset; -+ *a++ = *b++; -+ *a++ = p->pitch; -+ if (obj->format_modifier == 0) { -+ b += 2; -+ } -+ else { -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier >> 32); -+ } -+ } -+ } -+ -+ *a = EGL_NONE; -+ -+#if TRACE_ALL -+ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { -+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); -+ } -+#endif -+ { -+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, -+ EGL_NO_CONTEXT, -+ EGL_LINUX_DMA_BUF_EXT, -+ NULL, attribs); -+ if (!image) { -+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); -+ return -1; -+ } -+ -+ glGenTextures(1, &da->texture); -+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); -+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); -+ -+ eglDestroyImageKHR(de->setup.egl_dpy, image); -+ } -+ -+ da->fd = desc->objects[0].fd; -+ -+#if 0 -+ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," -+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, -+ bo_plane_handles[0], -+ bo_plane_handles[1], -+ bo_plane_handles[2], -+ bo_plane_handles[3], -+ pitches[0], -+ pitches[1], -+ pitches[2], -+ pitches[3], -+ offsets[0], -+ offsets[1], -+ offsets[2], -+ offsets[3], -+ (long long)modifiers[0], -+ (long long)modifiers[1], -+ (long long)modifiers[2], -+ 
(long long)modifiers[3] -+ ); -+#endif -+ } -+ -+ glClearColor(0.5, 0.5, 0.5, 0.5); -+ glClear(GL_COLOR_BUFFER_BIT); -+ -+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4); -+ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); -+ -+ glDeleteTextures(1, &da->texture); -+ da->texture = 0; -+ da->fd = -1; -+ -+ return 0; -+} -+ -+static void * display_thread(void * v) -+{ -+ AVFormatContext * const s = v; -+ egl_display_env_t * const de = s->priv_data; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); -+#endif -+ { -+ EGLint egl_major, egl_minor; -+ -+ de->setup.dpy = XOpenDisplay(NULL); -+ if (!de->setup.dpy) { -+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); -+ goto fail; -+ } -+ -+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); -+ if (!de->setup.egl_dpy) { -+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); -+ goto fail; -+ } -+ -+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); -+ goto fail; -+ } -+ -+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); -+ -+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { -+ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); -+ goto fail; -+ } -+ } -+ -+ if (!de->window_width || !de->window_height) { -+ de->window_width = 1280; -+ de->window_height = 720; -+ } -+ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", -+ &de->setup.win, &de->setup.ctx, &de->setup.surf)) { -+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); -+ goto fail; -+ } -+ -+ if (gl_setup(s)) { -+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); -+ goto fail; -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); -+#endif -+ sem_post(&de->display_start_sem); -+ -+ for (;;) { -+ AVFrame * frame; -+ -+ while (sem_wait(&de->q_sem) != 0) { -+ av_assert0(errno == EINTR); -+ } -+ -+ if 
(de->q_terminate) -+ break; -+ -+ pthread_mutex_lock(&de->q_lock); -+ frame = de->q_next; -+ de->q_next = NULL; -+ pthread_mutex_unlock(&de->q_lock); -+ -+ do_display(s, de, frame); -+ -+ av_frame_free(&de->q_this); -+ de->q_this = frame; -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); -+#endif -+ -+ return NULL; -+ -+fail: -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); -+#endif -+ de->q_terminate = 1; -+ sem_post(&de->display_start_sem); -+ -+ return NULL; -+} -+ -+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ const AVFrame * const src_frame = (AVFrame *)pkt->data; -+ AVFrame * frame; -+ egl_display_env_t * const de = s->priv_data; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { -+ frame = av_frame_alloc(); -+ av_frame_ref(frame, src_frame); -+ } -+ else if (src_frame->format == AV_PIX_FMT_VAAPI) { -+ frame = av_frame_alloc(); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (av_hwframe_map(frame, src_frame, 0) != 0) -+ { -+ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); -+ av_frame_free(&frame); -+ return AVERROR(EINVAL); -+ } -+ } -+ else { -+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); -+ return AVERROR(EINVAL); -+ } -+ -+ // Really hacky sync -+ while (de->show_all && de->q_next) { -+ usleep(3000); -+ } -+ -+ pthread_mutex_lock(&de->q_lock); -+ { -+ AVFrame * const t = de->q_next; -+ de->q_next = frame; -+ frame = t; -+ } -+ pthread_mutex_unlock(&de->q_lock); -+ -+ if (frame == NULL) -+ sem_post(&de->q_sem); -+ else -+ av_frame_free(&frame); -+ -+ return 0; -+} -+ -+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+ return AVERROR_PATCHWELCOME; -+} -+ -+static int 
egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); -+#endif -+ switch(type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static int egl_vout_init(struct AVFormatContext * s) -+{ -+ egl_display_env_t * const de = s->priv_data; -+ unsigned int i; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->setup = (struct egl_setup){0}; -+ -+ for (i = 0; i != 32; ++i) { -+ de->aux[i].fd = -1; -+ } -+ -+ de->q_terminate = 0; -+ pthread_mutex_init(&de->q_lock, NULL); -+ sem_init(&de->q_sem, 0, 0); -+ sem_init(&de->display_start_sem, 0, 0); -+ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); -+ -+ sem_wait(&de->display_start_sem); -+ if (de->q_terminate) { -+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); -+ return -1; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+ -+ return 0; -+} -+ -+static void egl_vout_deinit(struct AVFormatContext * s) -+{ -+ egl_display_env_t * const de = s->priv_data; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->q_terminate = 1; -+ sem_post(&de->q_sem); -+ pthread_join(de->q_thread, NULL); -+ sem_destroy(&de->q_sem); -+ pthread_mutex_destroy(&de->q_lock); -+ -+ av_frame_free(&de->q_next); -+ av_frame_free(&de->q_this); -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+} -+ -+#define OFFSET(x) offsetof(egl_display_env_t, x) -+static const AVOption options[] = { -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, 
-INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+ -+}; -+ -+static const AVClass egl_vout_class = { -+ .class_name = "egl vid outdev", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+FFOutputFormat ff_vout_egl_muxer = { -+ .p = { -+ .name = "vout_egl", -+ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &egl_vout_class, -+ }, -+ .priv_data_size = sizeof(egl_display_env_t), -+ .write_header = egl_vout_write_header, -+ .write_packet = egl_vout_write_packet, -+ .write_uncoded_frame = egl_vout_write_frame, -+ .write_trailer = egl_vout_write_trailer, -+ .control_message = egl_vout_control_message, -+ .init = egl_vout_init, -+ .deinit = egl_vout_deinit, -+}; -+ - -From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 28 Apr 2021 12:51:22 +0100 -Subject: [PATCH 016/136] V4L2 stateful rework - ---- - libavcodec/Makefile | 3 +- - libavcodec/v4l2_buffers.c | 556 +++++++++++++++++++++++++++----------- - libavcodec/v4l2_buffers.h | 28 +- - libavcodec/v4l2_context.c | 536 +++++++++++++++++++++++++++--------- - libavcodec/v4l2_context.h | 20 +- - libavcodec/v4l2_m2m.c | 20 +- - libavcodec/v4l2_m2m.h | 31 +++ - libavcodec/v4l2_m2m_dec.c | 446 ++++++++++++++++++++++++++---- - 8 files changed, 1286 insertions(+), 354 deletions(-) - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 2d440b5648..e1aa0ba014 100644 ---- a/libavcodec/Makefile -+++ 
b/libavcodec/Makefile -@@ -169,7 +169,8 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o - OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - OBJS-$(CONFIG_VP8DSP) += vp8dsp.o --OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o -+OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ -+ weak_link.o - OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ - v4l2_req_devscan.o weak_link.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 3f5471067a..a003934ca1 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -21,6 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include - #include - #include - #include -@@ -29,12 +30,14 @@ - #include - #include "libavcodec/avcodec.h" - #include "libavutil/pixdesc.h" -+#include "libavutil/hwcontext.h" - #include "v4l2_context.h" - #include "v4l2_buffers.h" - #include "v4l2_m2m.h" -+#include "weak_link.h" - - #define USEC_PER_SEC 1000000 --static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; -+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - - static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) - { -@@ -51,34 +54,44 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) - static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) - { - V4L2m2mContext *s = buf_to_m2mctx(avbuf); -- -- if (s->avctx->pkt_timebase.num) -- return s->avctx->pkt_timebase; -- return s->avctx->time_base; -+ const AVRational tb = s->avctx->pkt_timebase.num ? -+ s->avctx->pkt_timebase : -+ s->avctx->time_base; -+ return tb.num && tb.den ? 
tb : v4l2_timebase; - } - --static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) -+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) - { -- int64_t v4l2_pts; -- -- if (pts == AV_NOPTS_VALUE) -- pts = 0; -- - /* convert pts to v4l2 timebase */ -- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); -+ const int64_t v4l2_pts = -+ no_rescale ? pts : -+ pts == AV_NOPTS_VALUE ? 0 : -+ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; - } - --static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) -+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) - { -- int64_t v4l2_pts; -- - /* convert pts back to encoder timebase */ -- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -+ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - -- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); -+ return -+ no_rescale ? v4l2_pts : -+ v4l2_pts == 0 ? 
AV_NOPTS_VALUE : -+ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); -+} -+ -+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) -+{ -+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -+ out->planes[plane].bytesused = bytesused; -+ out->planes[plane].length = length; -+ } else { -+ out->buf.bytesused = bytesused; -+ out->buf.length = length; -+ } - } - - static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) -@@ -209,68 +222,143 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) - return AVCOL_TRC_UNSPECIFIED; - } - --static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) - { -- V4L2Buffer* avbuf = opaque; -- V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; - -- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { -- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; - -- if (s->reinit) { -- if (!atomic_load(&s->refcount)) -- sem_post(&s->refsync); -- } else { -- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { -- /* no need to queue more buffers to the driver */ -- avbuf->status = V4L2BUF_AVAILABLE; -- } -- else if (avbuf->context->streamon) -- ff_v4l2_buffer_enqueue(avbuf); -- } -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_YUYV422: -+ -+ layer->format = DRM_FORMAT_YUYV; -+ layer->nb_planes = 1; -+ -+ break; -+ -+ case AV_PIX_FMT_NV12: -+ case AV_PIX_FMT_NV21: -+ -+ layer->format = 
avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? -+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ break; -+ -+ case AV_PIX_FMT_YUV420P: -+ -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + -+ ((avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height) >> 2); -+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ break; - -- av_buffer_unref(&avbuf->context_ref); -+ default: -+ drm_desc->nb_layers = 0; -+ break; - } -+ -+ return (uint8_t *) drm_desc; - } - --static int v4l2_buf_increase_ref(V4L2Buffer *in) -+static void v4l2_free_bufref(void *opaque, uint8_t *data) - { -- V4L2m2mContext *s = buf_to_m2mctx(in); -+ AVBufferRef * bufref = (AVBufferRef *)data; -+ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; -+ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - -- if (in->context_ref) -- atomic_fetch_add(&in->context_refcount, 1); -- else { -- in->context_ref = av_buffer_ref(s->self_ref); -- if (!in->context_ref) -- return AVERROR(ENOMEM); -+ if (ctx != NULL) { -+ // Buffer still attached to context -+ V4L2m2mContext *s = buf_to_m2mctx(avbuf); - -- in->context_refcount = 1; -- } -+ ff_mutex_lock(&ctx->lock); - -- in->status = V4L2BUF_RET_USER; -- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); -+ avbuf->status = V4L2BUF_AVAILABLE; - -- return 0; -+ if (s->draining && 
V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); -+ /* no need to queue more buffers to the driver */ -+ } -+ else if (ctx->streamon) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); -+ avbuf->buf.timestamp.tv_sec = 0; -+ avbuf->buf.timestamp.tv_usec = 0; -+ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER -+ } -+ else { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); -+ } -+ -+ ff_mutex_unlock(&ctx->lock); -+ } -+ -+ ff_weak_link_unlock(avbuf->context_wl); -+ av_buffer_unref(&bufref); - } - --static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - { -- int ret; -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; - -- if (plane >= in->num_planes) -- return AVERROR(EINVAL); -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); - -- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ -- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -- in->plane_info[plane].length, v4l2_free_buffer, in, 0); -- if (!*buf) -- return AVERROR(ENOMEM); -+ expbuf.index = avbuf->buf.index; -+ expbuf.type = avbuf->buf.type; -+ expbuf.plane = i; - -- ret = v4l2_buf_increase_ref(in); -- if (ret) -- av_buffer_unref(buf); -+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); - -- return ret; -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buf.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ 
avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } -+ } -+ -+ return 0; - } - - static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) -@@ -285,30 +373,50 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i - - memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); - -- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -- out->planes[plane].bytesused = bytesused; -- out->planes[plane].length = length; -- } else { -- out->buf.bytesused = bytesused; -- out->buf.length = length; -- } -+ set_buf_length(out, plane, bytesused, length); - - return 0; - } - -+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) -+{ -+ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); -+ AVBufferRef * newbuf; -+ -+ if (!bufref) -+ return NULL; -+ -+ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); -+ if (newbuf == NULL) -+ av_buffer_unref(&bufref); -+ -+ avbuf->status = V4L2BUF_RET_USER; -+ return newbuf; -+} -+ - static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - { -- int i, ret; -+ int i; - - frame->format = avbuf->context->av_pix_fmt; - -- for (i = 0; i < avbuf->num_planes; i++) { -- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -- if (ret) -- return ret; -+ frame->buf[0] = wrap_avbuf(avbuf); -+ if (frame->buf[0] == NULL) -+ return AVERROR(ENOMEM); -+ -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ /* 1. get references to the actual data */ -+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); -+ return 0; -+ } -+ - -+ /* 1. 
get references to the actual data */ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; - frame->linesize[i] = avbuf->plane_info[i].bytesperline; -- frame->data[i] = frame->buf[i]->data; - } - - /* fixup special cases */ -@@ -337,68 +445,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - return 0; - } - -+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) -+{ -+ if (dst_stride == src_stride && w + 32 >= dst_stride) { -+ memcpy(dst, src, dst_stride * h); -+ } -+ else { -+ while (--h >= 0) { -+ memcpy(dst, src, w); -+ dst += dst_stride; -+ src += src_stride; -+ } -+ } -+} -+ -+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) -+{ -+ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); -+} -+ - static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { -- int i, ret; -- struct v4l2_format fmt = out->context->format; -- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? -- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; -- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
-- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; -- int is_planar_format = 0; -- -- switch (pixel_format) { -- case V4L2_PIX_FMT_YUV420M: -- case V4L2_PIX_FMT_YVU420M: --#ifdef V4L2_PIX_FMT_YUV422M -- case V4L2_PIX_FMT_YUV422M: --#endif --#ifdef V4L2_PIX_FMT_YVU422M -- case V4L2_PIX_FMT_YVU422M: --#endif --#ifdef V4L2_PIX_FMT_YUV444M -- case V4L2_PIX_FMT_YUV444M: --#endif --#ifdef V4L2_PIX_FMT_YVU444M -- case V4L2_PIX_FMT_YVU444M: --#endif -- case V4L2_PIX_FMT_NV12M: -- case V4L2_PIX_FMT_NV21M: -- case V4L2_PIX_FMT_NV12MT_16X16: -- case V4L2_PIX_FMT_NV12MT: -- case V4L2_PIX_FMT_NV16M: -- case V4L2_PIX_FMT_NV61M: -- is_planar_format = 1; -- } -- -- if (!is_planar_format) { -- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -- int planes_nb = 0; -- int offset = 0; -- -- for (i = 0; i < desc->nb_components; i++) -- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); -- -- for (i = 0; i < planes_nb; i++) { -- int size, h = height; -- if (i == 1 || i == 2) { -+ int i; -+ int num_planes = 0; -+ int pel_strides[4] = {0}; -+ -+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -+ -+ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { -+ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); -+ return -1; -+ } -+ -+ for (i = 0; i != desc->nb_components; ++i) { -+ if (desc->comp[i].plane >= num_planes) -+ num_planes = desc->comp[i].plane + 1; -+ pel_strides[desc->comp[i].plane] = desc->comp[i].step; -+ } -+ -+ if (out->num_planes > 1) { -+ if (num_planes != out->num_planes) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); -+ return -1; -+ } -+ for (i = 0; i != num_planes; ++i) { -+ int w = frame->width; -+ int h = frame->height; -+ if (is_chroma(desc, i, num_planes)) { -+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); - h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); - } -- size = frame->linesize[i] * h; -- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, 
offset); -- if (ret) -- return ret; -- offset += size; -+ -+ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, -+ frame->data[i], frame->linesize[i], -+ w * pel_strides[i], h); -+ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); - } -- return 0; - } -+ else -+ { -+ unsigned int offset = 0; -+ -+ for (i = 0; i != num_planes; ++i) { -+ int w = frame->width; -+ int h = frame->height; -+ int dst_stride = out->plane_info[0].bytesperline; -+ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; -+ -+ if (is_chroma(desc, i, num_planes)) { -+ // Is chroma -+ dst_stride >>= desc->log2_chroma_w; -+ offset += dst_stride * (out->context->height >> desc->log2_chroma_h); -+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); -+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); -+ } -+ else { -+ // Is luma or alpha -+ offset += dst_stride * out->context->height; -+ } -+ if (offset > out->plane_info[0].length) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); -+ return -1; -+ } - -- for (i = 0; i < out->num_planes; i++) { -- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); -- if (ret) -- return ret; -+ cpy_2d(dst, dst_stride, -+ frame->data[i], frame->linesize[i], -+ w * pel_strides[i], h); -+ } -+ set_buf_length(out, 0, offset, out->plane_info[0].length); - } -- - return 0; - } - -@@ -410,14 +545,15 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { -- v4l2_set_pts(out, frame->pts); -+ v4l2_set_pts(out, frame->pts, 0); - - return v4l2_buffer_swframe_to_buf(frame, out); - } - --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) - { - int ret; -+ V4L2Context * const ctx = avbuf->context; - - 
av_frame_unref(frame); - -@@ -432,13 +568,22 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); - frame->color_trc = v4l2_get_color_trc(avbuf); -- frame->pts = v4l2_get_pts(avbuf); -+ frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); - frame->pkt_dts = AV_NOPTS_VALUE; - - /* these values are updated also during re-init in v4l2_process_driver_event */ -- frame->height = avbuf->context->height; -- frame->width = avbuf->context->width; -- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; -+ frame->height = ctx->height; -+ frame->width = ctx->width; -+ frame->sample_aspect_ratio = ctx->sample_aspect_ratio; -+ -+ if (ctx->selection.height && ctx->selection.width) { -+ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; -+ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; -+ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? -+ frame->width - (ctx->selection.left + ctx->selection.width) : 0; -+ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? -+ frame->height - (ctx->selection.top + ctx->selection.height) : 0; -+ } - - /* 3. report errors upstream */ - if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { -@@ -451,15 +596,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - { -- int ret; -- - av_packet_unref(pkt); -- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); -- if (ret) -- return ret; -+ -+ pkt->buf = wrap_avbuf(avbuf); -+ if (pkt->buf == NULL) -+ return AVERROR(ENOMEM); - - pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; -- pkt->data = pkt->buf->data; -+ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; - - if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) - pkt->flags |= AV_PKT_FLAG_KEY; -@@ -469,20 +613,27 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - pkt->flags |= AV_PKT_FLAG_CORRUPT; - } - -- pkt->dts = pkt->pts = v4l2_get_pts(avbuf); -+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); - - return 0; - } - --int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts) - { - int ret; - -- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); -+ if (extlen) { -+ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); -+ if (ret) -+ return ret; -+ } -+ -+ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); - if (ret) - return ret; - -- v4l2_set_pts(out, pkt->pts); -+ v4l2_set_pts(out, pkt->pts, no_rescale_pts); - - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; -@@ -490,15 +641,61 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) - return 0; - } - --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) -+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+{ -+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); -+} -+ -+ -+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) -+{ -+ V4L2Buffer * const avbuf = (V4L2Buffer *)data; -+ int i; -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { -+ struct V4L2Plane_info *p = avbuf->plane_info + i; -+ if (p->mm_addr != NULL) -+ munmap(p->mm_addr, p->length); -+ } -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -+ if (avbuf->drm_frame.objects[i].fd != -1) -+ close(avbuf->drm_frame.objects[i].fd); -+ } -+ -+ ff_weak_link_unref(&avbuf->context_wl); 
-+ -+ av_free(avbuf); -+} -+ -+ -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) - { -- V4L2Context *ctx = avbuf->context; - int ret, i; -+ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); -+ AVBufferRef * bufref; -+ -+ *pbufref = NULL; -+ if (avbuf == NULL) -+ return AVERROR(ENOMEM); -+ -+ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); -+ if (bufref == NULL) { -+ av_free(avbuf); -+ return AVERROR(ENOMEM); -+ } - -+ avbuf->context = ctx; - avbuf->buf.memory = V4L2_MEMORY_MMAP; - avbuf->buf.type = ctx->type; - avbuf->buf.index = index; - -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -+ avbuf->drm_frame.objects[i].fd = -1; -+ } -+ -+ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); -+ - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.length = VIDEO_MAX_PLANES; - avbuf->buf.m.planes = avbuf->planes; -@@ -506,7 +703,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); - if (ret < 0) -- return AVERROR(errno); -+ goto fail; - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->num_planes = 0; -@@ -526,25 +723,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ } - } else { - avbuf->plane_info[i].length = avbuf->buf.length; -- avbuf->plane_info[i].mm_addr = 
mmap(NULL, avbuf->buf.length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ } - } - -- if (avbuf->plane_info[i].mm_addr == MAP_FAILED) -- return AVERROR(ENOMEM); -+ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { -+ avbuf->plane_info[i].mm_addr = NULL; -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } - } - - avbuf->status = V4L2BUF_AVAILABLE; - -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- return 0; -- - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.m.planes = avbuf->planes; - avbuf->buf.length = avbuf->num_planes; -@@ -554,7 +759,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->buf.length = avbuf->planes[0].length; - } - -- return ff_v4l2_buffer_enqueue(avbuf); -+ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ ret = v4l2_buffer_export_drm(avbuf); -+ if (ret) -+ goto fail; -+ } -+ } -+ -+ *pbufref = bufref; -+ return 0; -+ -+fail: -+ av_buffer_unref(&bufref); -+ return ret; - } - - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) -@@ -563,9 +781,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - - avbuf->buf.flags = avbuf->flags; - -+ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", -+ avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -+ avbuf->context->q_count); -+ } -+ - ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); -- if (ret < 0) -- return AVERROR(errno); -+ if (ret < 0) { -+ int err = errno; -+ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", -+ 
avbuf->context->name, avbuf->buf.index, -+ err, strerror(err)); -+ return AVERROR(err); -+ } -+ -+ ++avbuf->context->q_count; -+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", -+ avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -+ avbuf->context->q_count); - - avbuf->status = V4L2BUF_IN_DRIVER; - -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 3d2ff1b9a5..111526aee3 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -28,27 +28,37 @@ - #include - #include - -+#include "avcodec.h" - #include "libavutil/buffer.h" - #include "libavutil/frame.h" -+#include "libavutil/hwcontext_drm.h" - #include "packet.h" - - enum V4L2Buffer_status { - V4L2BUF_AVAILABLE, - V4L2BUF_IN_DRIVER, -+ V4L2BUF_IN_USE, - V4L2BUF_RET_USER, - }; - - /** - * V4L2Buffer (wrapper for v4l2_buffer management) - */ -+struct V4L2Context; -+struct ff_weak_link_client; -+ - typedef struct V4L2Buffer { -- /* each buffer needs to have a reference to its context */ -+ /* each buffer needs to have a reference to its context -+ * The pointer is good enough for most operation but once the buffer has -+ * been passed to the user the buffer may become orphaned so for free ops -+ * the weak link must be used to ensure that the context is actually -+ * there -+ */ - struct V4L2Context *context; -+ struct ff_weak_link_client *context_wl; - -- /* This object is refcounted per-plane, so we need to keep track -- * of how many context-refs we are holding. 
*/ -- AVBufferRef *context_ref; -- atomic_uint context_refcount; -+ /* DRM descriptor */ -+ AVDRMFrameDescriptor drm_frame; - - /* keep track of the mmap address and mmap length */ - struct V4L2Plane_info { -@@ -73,11 +83,12 @@ typedef struct V4L2Buffer { - * - * @param[in] frame The AVFRame to push the information to - * @param[in] buf The V4L2Buffer to get the information from -+ * @param[in] no_rescale_pts If non-zero do not rescale PTS - * - * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, - * AVERROR(ENOMEM) if the AVBufferRef can't be created. - */ --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); - - /** - * Extracts the data from a V4L2Buffer to an AVPacket -@@ -101,6 +112,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); - */ - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts); -+ - /** - * Extracts the data from an AVFrame to a V4L2Buffer - * -@@ -119,7 +133,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); - * - * @returns 0 in case of success, a negative AVERROR code otherwise - */ --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); - - /** - * Enqueues a V4L2Buffer -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index a40be94690..be76068af3 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -27,11 +27,13 @@ - #include - #include - #include -+#include "libavutil/avassert.h" - #include "libavcodec/avcodec.h" - #include "decode.h" - #include "v4l2_buffers.h" - #include "v4l2_fmt.h" - #include "v4l2_m2m.h" -+#include "weak_link.h" - - struct v4l2_format_update { 
- uint32_t v4l2_fmt; -@@ -153,21 +155,99 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd - } - } - --static int v4l2_start_decode(V4L2Context *ctx) -+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) - { -- struct v4l2_decoder_cmd cmd = { -- .cmd = V4L2_DEC_CMD_START, -- .flags = 0, -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ struct v4l2_selection selection = { -+ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, -+ .target = V4L2_SEL_TGT_COMPOSE - }; -- int ret; - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd); -- if (ret) -+ memset(r, 0, sizeof(*r)); -+ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) - return AVERROR(errno); - -+ *r = selection.r; - return 0; - } - -+static int do_source_change(V4L2m2mContext * const s) -+{ -+ AVCodecContext *const avctx = s->avctx; -+ -+ int ret; -+ int reinit; -+ int full_reinit; -+ struct v4l2_format cap_fmt = s->capture.format; -+ -+ s->resize_pending = 0; -+ s->capture.done = 0; -+ -+ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); -+ if (ret) { -+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); -+ return 0; -+ } -+ -+ s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -+ -+ get_default_selection(&s->capture, &s->capture.selection); -+ -+ reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); -+ if (reinit) { -+ s->capture.height = v4l2_get_height(&cap_fmt); -+ s->capture.width = v4l2_get_width(&cap_fmt); -+ } -+ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); -+ -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", -+ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, -+ s->capture.selection.width, s->capture.selection.height, -+ s->capture.selection.left, s->capture.selection.top); -+ -+ s->reinit = 1; -+ -+ if (reinit) { -+ if (avctx) -+ ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); -+ if (ret < 0) -+ av_log(avctx, 
AV_LOG_WARNING, "update avcodec height and width failed\n"); -+ -+ ret = ff_v4l2_m2m_codec_reinit(s); -+ if (ret) { -+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); -+ return AVERROR(EINVAL); -+ } -+ goto reinit_run; -+ } -+ -+ /* Buffers are OK so just stream off to ack */ -+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); -+ -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); -+ s->draining = 0; -+ -+ /* reinit executed */ -+reinit_run: -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); -+ return 1; -+} -+ -+static int ctx_done(V4L2Context * const ctx) -+{ -+ int rv = 0; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ -+ ctx->done = 1; -+ -+ if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ rv = do_source_change(s); -+ -+ return rv; -+} -+ - /** - * handle resolution change event and end of stream event - * returns 1 if reinit was successful, negative if it failed -@@ -175,8 +255,7 @@ static int v4l2_start_decode(V4L2Context *ctx) - */ - static int v4l2_handle_event(V4L2Context *ctx) - { -- V4L2m2mContext *s = ctx_to_m2mctx(ctx); -- struct v4l2_format cap_fmt = s->capture.format; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - struct v4l2_event evt = { 0 }; - int ret; - -@@ -186,44 +265,22 @@ static int v4l2_handle_event(V4L2Context *ctx) - return 0; - } - -+ av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); -+ - if (evt.type == V4L2_EVENT_EOS) { -- ctx->done = 1; -+// ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); - return 0; - } - - if (evt.type != V4L2_EVENT_SOURCE_CHANGE) - return 0; - -- ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); -- if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); -- return 0; -- } -- -- if (v4l2_resolution_changed(&s->capture, &cap_fmt)) { -- s->capture.height = 
v4l2_get_height(&cap_fmt); -- s->capture.width = v4l2_get_width(&cap_fmt); -- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); -- } else { -- v4l2_start_decode(ctx); -+ s->resize_pending = 1; -+ if (!ctx->done) - return 0; -- } -- -- s->reinit = 1; -- -- if (s->avctx) -- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); -- if (ret < 0) -- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); -- -- ret = ff_v4l2_m2m_codec_reinit(s); -- if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); -- return AVERROR(EINVAL); -- } - -- /* reinit executed */ -- return 1; -+ return do_source_change(s); - } - - static int v4l2_stop_decode(V4L2Context *ctx) -@@ -266,8 +323,26 @@ static int v4l2_stop_encode(V4L2Context *ctx) - return 0; - } - -+static int count_in_driver(const V4L2Context * const ctx) -+{ -+ int i; -+ int n = 0; -+ -+ if (!ctx->bufrefs) -+ return -1; -+ -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) -+ ++n; -+ } -+ return n; -+} -+ - static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - { -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - struct v4l2_buffer buf = { 0 }; - V4L2Buffer *avbuf; -@@ -276,50 +351,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - .fd = ctx_to_m2mctx(ctx)->fd, - }; - int i, ret; -+ int no_rx_means_done = 0; - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { -+ if (is_capture && ctx->bufrefs) { - for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - break; - } - if (i == ctx->num_buffers) -- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " -+ 
av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " - "userspace. Increase num_capture_buffers " - "to prevent device deadlock or dropped " -- "packets/frames.\n"); -+ "packets/frames.\n", i); - } - -+#if 0 -+ // I think this is true but pointless -+ // we will get some other form of EOF signal -+ - /* if we are draining and there are no more capture buffers queued in the driver we are done */ -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { -+ if (is_capture && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { - /* capture buffer initialization happens during decode hence - * detection happens at runtime - */ -- if (!ctx->buffers) -+ if (!ctx->bufrefs) - break; - -- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - goto start; - } - ctx->done = 1; - return NULL; - } -+#endif - - start: -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- pfd.events = POLLOUT | POLLWRNORM; -- else { -+ if (is_capture) { - /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; -+ } else { -+ pfd.events = POLLOUT | POLLWRNORM; - } -+ no_rx_means_done = s->resize_pending && is_capture; - - for (;;) { -- ret = poll(&pfd, 1, timeout); -+ // If we have a resize pending then all buffers should be Qed -+ // With a resize pending we should be in drain but evidence suggests -+ // that not all decoders do this so poll to clear -+ int t2 = no_rx_means_done ? 0 : timeout < 0 ? 
3000 : timeout; -+ const int e = pfd.events; -+ -+ ret = poll(&pfd, 1, t2); -+ - if (ret > 0) - break; -- if (errno == EINTR) -- continue; -+ -+ if (ret < 0) { -+ int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", -+ err, strerror(err), -+ e, count_in_driver(ctx)); -+ return NULL; -+ } -+ -+ // ret == 0 (timeout) -+ if (no_rx_means_done) { -+ av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); -+ ret = ctx_done(ctx); -+ if (ret > 0) -+ goto start; -+ } -+ if (timeout == -1) -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; - return NULL; - } - -@@ -329,7 +438,8 @@ start: - no need to raise a warning */ - if (timeout == 0) { - for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status != V4L2BUF_AVAILABLE) - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); - } - } -@@ -347,22 +457,25 @@ start: - ctx->done = 1; - return NULL; - } -- if (ret) { -- /* if re-init was successful drop the buffer (if there was one) -- * since we had to reconfigure capture (unmap all buffers) -- */ -- return NULL; -- } -+ if (ret > 0) -+ goto start; - } - - /* 2. 
dequeue the buffer */ - if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (is_capture) { - /* there is a capture buffer ready */ - if (pfd.revents & (POLLIN | POLLRDNORM)) - goto dequeue; - -+ // CAPTURE Q drained -+ if (no_rx_means_done) { -+ if (ctx_done(ctx) > 0) -+ goto start; -+ return NULL; -+ } -+ - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ -@@ -380,37 +493,58 @@ dequeue: - buf.m.planes = planes; - } - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); -- if (ret) { -- if (errno != EAGAIN) { -- ctx->done = 1; -- if (errno != EPIPE) -+ while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ if (err != EAGAIN) { -+ // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST -+ if (err != EPIPE || !is_capture) - av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -- ctx->name, av_err2str(AVERROR(errno))); -+ ctx->name, av_err2str(AVERROR(err))); -+ if (ctx_done(ctx) > 0) -+ goto start; - } - return NULL; - } -+ --ctx->q_count; -+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", -+ ctx->name, buf.index, -+ buf.timestamp.tv_sec, buf.timestamp.tv_usec, -+ ctx->q_count, ++ctx->dq_count); - -- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ avbuf->status = V4L2BUF_AVAILABLE; -+ avbuf->buf = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buf.m.planes = avbuf->planes; -+ } -+ -+ if (ctx_to_m2mctx(ctx)->draining && is_capture) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
- buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { -- ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); -+ -+ // Must reQ so we don't leak -+ // May not matter if the next thing we do is release all the -+ // buffers but better to be tidy. -+ ff_v4l2_buffer_enqueue(avbuf); -+ -+ if (ctx_done(ctx) > 0) -+ goto start; - return NULL; - } - #ifdef V4L2_BUF_FLAG_LAST -- if (buf.flags & V4L2_BUF_FLAG_LAST) -- ctx->done = 1; -+ if (buf.flags & V4L2_BUF_FLAG_LAST) { -+ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); -+ avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer -+ ctx_done(ctx); -+ } - #endif - } - -- avbuf = &ctx->buffers[buf.index]; -- avbuf->status = V4L2BUF_AVAILABLE; -- avbuf->buf = buf; -- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- memcpy(avbuf->planes, planes, sizeof(planes)); -- avbuf->buf.m.planes = avbuf->planes; -- } - return avbuf; - } - -@@ -429,8 +563,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - } - - for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) -- return &ctx->buffers[i]; -+ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_AVAILABLE) -+ return avbuf; - } - - return NULL; -@@ -438,25 +573,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - - static int v4l2_release_buffers(V4L2Context* ctx) - { -- struct v4l2_requestbuffers req = { -- .memory = V4L2_MEMORY_MMAP, -- .type = ctx->type, -- .count = 0, /* 0 -> unmaps buffers from the driver */ -- }; -- int i, j; -+ int i; -+ int ret = 0; -+ const int fd = ctx_to_m2mctx(ctx)->fd; - -- for (i = 0; i < ctx->num_buffers; i++) { -- V4L2Buffer *buffer = &ctx->buffers[i]; -+ // Orphan any buffers in the wild -+ ff_weak_link_break(&ctx->wl_master); -+ -+ if (ctx->bufrefs) { -+ for (i = 0; i < ctx->num_buffers; i++) -+ av_buffer_unref(ctx->bufrefs + i); -+ } -+ -+ if (fd != -1) { -+ struct v4l2_requestbuffers req = { -+ 
.memory = V4L2_MEMORY_MMAP, -+ .type = ctx->type, -+ .count = 0, /* 0 -> unmap all buffers from the driver */ -+ }; -+ -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); - -- for (j = 0; j < buffer->num_planes; j++) { -- struct V4L2Plane_info *p = &buffer->plane_info[j]; -- if (p->mm_addr && p->length) -- if (munmap(p->mm_addr, p->length) < 0) -- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", -+ ctx->name, av_err2str(AVERROR(errno))); -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) -+ av_log(logger(ctx), AV_LOG_ERROR, -+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" -+ "for all buffers: \n" -+ " 1. drmModeRmFB(..)\n" -+ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); - } - } -+ ctx->q_count = 0; - -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ return ret; - } - - static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -485,6 +640,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm - - static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) - { -+ V4L2m2mContext* s = ctx_to_m2mctx(ctx); -+ V4L2m2mPriv *priv = s->avctx->priv_data; - enum AVPixelFormat pixfmt = ctx->av_pix_fmt; - struct v4l2_fmtdesc fdesc; - int ret; -@@ -503,6 +660,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) - if (ret) - return AVERROR(EINVAL); - -+ if (priv->pix_fmt != AV_PIX_FMT_NONE) { -+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { -+ fdesc.index++; -+ continue; -+ } -+ } -+ - pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); - ret = v4l2_try_raw_format(ctx, pixfmt); - if (ret){ -@@ -555,18 +719,73 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) - * - 
*****************************************************************************/ - -+ -+static void flush_all_buffers_status(V4L2Context* const ctx) -+{ -+ int i; -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (buf->status == V4L2BUF_IN_DRIVER) -+ buf->status = V4L2BUF_AVAILABLE; -+ } -+ ctx->q_count = 0; -+} -+ -+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) -+{ -+ int i; -+ int rv; -+ -+ if (!ctx->bufrefs) { -+ rv = ff_v4l2_context_init(ctx); -+ if (rv) { -+ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); -+ return rv; -+ } -+ } -+ -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (buf->status == V4L2BUF_AVAILABLE) { -+ rv = ff_v4l2_buffer_enqueue(buf); -+ if (rv < 0) -+ return rv; -+ } -+ } -+ return 0; -+} -+ - int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - { - int type = ctx->type; - int ret; -+ AVCodecContext * const avctx = logger(ctx); -+ -+ ff_mutex_lock(&ctx->lock); -+ -+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ stuff_all_buffers(avctx, ctx); - - ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); -- if (ret < 0) -- return AVERROR(errno); -+ if (ret < 0) { -+ const int err = errno; -+ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, -+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); -+ ret = AVERROR(err); -+ } -+ else -+ { -+ if (cmd == VIDIOC_STREAMOFF) -+ flush_all_buffers_status(ctx); - -- ctx->streamon = (cmd == VIDIOC_STREAMON); -+ ctx->streamon = (cmd == VIDIOC_STREAMON); -+ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, -+ cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); -+ } - -- return 0; -+ ff_mutex_unlock(&ctx->lock); -+ -+ return ret; - } - - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -594,7 +813,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, -+ const void * extdata, size_t extlen, int no_rescale_pts) - { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); - V4L2Buffer* avbuf; -@@ -602,8 +822,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - - if (!pkt->size) { - ret = v4l2_stop_decode(ctx); -+ // Log but otherwise ignore stop failure - if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); - s->draining = 1; - return 0; - } -@@ -612,14 +833,14 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); - if (ret) - return ret; - - return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) - { - V4L2Buffer *avbuf; - -@@ -636,7 +857,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - return AVERROR(EAGAIN); - } - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); - } - - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -@@ -695,54 +916,57 @@ void ff_v4l2_context_release(V4L2Context* ctx) - { - int ret; - -- if (!ctx->buffers) -+ if 
(!ctx->bufrefs) - return; - - ret = v4l2_release_buffers(ctx); - if (ret) - av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - -- av_freep(&ctx->buffers); -+ av_freep(&ctx->bufrefs); -+ av_buffer_unref(&ctx->frames_ref); -+ -+ ff_mutex_destroy(&ctx->lock); - } - --int ff_v4l2_context_init(V4L2Context* ctx) -+ -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) - { -- V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - struct v4l2_requestbuffers req; -- int ret, i; -- -- if (!v4l2_type_supported(ctx)) { -- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); -- return AVERROR_PATCHWELCOME; -- } -- -- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); -- if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); -+ int ret; -+ int i; - - memset(&req, 0, sizeof(req)); -- req.count = ctx->num_buffers; -+ req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; - req.type = ctx->type; -- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); -- if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); -- return AVERROR(errno); -+ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno != EINTR) { -+ ret = AVERROR(errno); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); -+ return ret; -+ } - } - - ctx->num_buffers = req.count; -- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); -- if (!ctx->buffers) { -+ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); -+ if (!ctx->bufrefs) { - av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); -- return AVERROR(ENOMEM); -+ goto fail_release; - } - -- for (i = 0; i < req.count; i++) { -- ctx->buffers[i].context = ctx; -- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); -- if (ret < 0) { -+ ctx->wl_master = 
ff_weak_link_new(ctx); -+ if (!ctx->wl_master) { -+ ret = AVERROR(ENOMEM); -+ goto fail_release; -+ } -+ -+ for (i = 0; i < ctx->num_buffers; i++) { -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); -+ if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); -- goto error; -+ goto fail_release; - } - } - -@@ -756,10 +980,62 @@ int ff_v4l2_context_init(V4L2Context* ctx) - - return 0; - --error: -+fail_release: - v4l2_release_buffers(ctx); -+ av_freep(&ctx->bufrefs); -+ return ret; -+} -+ -+int ff_v4l2_context_init(V4L2Context* ctx) -+{ -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ int ret; -+ -+ // It is not valid to reinit a context without a previous release -+ av_assert0(ctx->bufrefs == NULL); -+ -+ if (!v4l2_type_supported(ctx)) { -+ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); -+ return AVERROR_PATCHWELCOME; -+ } -+ -+ ff_mutex_init(&ctx->lock, NULL); - -- av_freep(&ctx->buffers); -+ if (s->output_drm) { -+ AVHWFramesContext *hwframes; -+ -+ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); -+ if (!ctx->frames_ref) { -+ ret = AVERROR(ENOMEM); -+ goto fail_unlock; -+ } -+ -+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; -+ hwframes->format = AV_PIX_FMT_DRM_PRIME; -+ hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width; -+ hwframes->height = ctx->height; -+ ret = av_hwframe_ctx_init(ctx->frames_ref); -+ if (ret < 0) -+ goto fail_unref_hwframes; -+ } -+ -+ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); -+ goto fail_unref_hwframes; -+ } -+ -+ ret = create_buffers(ctx, ctx->num_buffers); -+ if (ret < 0) -+ goto fail_unref_hwframes; -+ -+ return 0; - -+fail_unref_hwframes: -+ av_buffer_unref(&ctx->frames_ref); -+fail_unlock: -+ ff_mutex_destroy(&ctx->lock); - return ret; - } -diff --git 
a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 6f7460c89a..59009d11d1 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -32,6 +32,8 @@ - #include "libavutil/rational.h" - #include "codec_id.h" - #include "packet.h" -+#include "libavutil/buffer.h" -+#include "libavutil/thread.h" - #include "v4l2_buffers.h" - - typedef struct V4L2Context { -@@ -71,11 +73,12 @@ typedef struct V4L2Context { - */ - int width, height; - AVRational sample_aspect_ratio; -+ struct v4l2_rect selection; - - /** -- * Indexed array of V4L2Buffers -+ * Indexed array of pointers to V4L2Buffers - */ -- V4L2Buffer *buffers; -+ AVBufferRef **bufrefs; - - /** - * Readonly after init. -@@ -93,6 +96,12 @@ typedef struct V4L2Context { - */ - int done; - -+ AVBufferRef *frames_ref; -+ int q_count; -+ int dq_count; -+ struct ff_weak_link_master *wl_master; -+ -+ AVMutex lock; - } V4L2Context; - - /** -@@ -157,9 +166,12 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); - * @param[in] ctx The V4L2Context to dequeue from. - * @param[inout] f The AVFrame to dequeue to. - * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) -+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as -+ * timestamp directly) -+ * - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. - */ --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); - - /** - * Enqueues a buffer to a V4L2Context from an AVPacket -@@ -171,7 +183,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); - * @param[in] pkt A pointer to an AVPacket. - * @return 0 in case of success, a negative error otherwise. 
- */ --int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); - - /** - * Enqueues a buffer to a V4L2Context from an AVFrame -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 602efb7a16..516e6d9858 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -216,13 +216,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) - av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); - - /* 2. unmap the capture buffers (v4l2 and ffmpeg): -- * we must wait for all references to be released before being allowed -- * to queue new buffers. - */ -- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); -- if (atomic_load(&s->refcount)) -- while(sem_wait(&s->refsync) == -1 && errno == EINTR); -- - ff_v4l2_context_release(&s->capture); - - /* 3. get the new capture format */ -@@ -259,6 +253,8 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) - av_frame_free(&s->frame); - av_packet_unref(&s->buf_pkt); - -+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); -+ - av_free(s); - } - -@@ -270,6 +266,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - if (!s) - return 0; - -+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); -+ -+ if (av_codec_is_decoder(s->avctx->codec)) -+ av_packet_unref(&s->buf_pkt); -+ - if (s->fd >= 0) { - ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); - if (ret) -@@ -282,7 +283,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - - ff_v4l2_context_release(&s->output); - -+ close(s->fd); -+ s->fd = -1; -+ - s->self_ref = NULL; -+ // This is only called on avctx close so after this point we don't have that -+ // Crash sooner if we find we are using it (can still log with avctx = NULL) -+ s->avctx = NULL; -+ priv->context = NULL; - av_buffer_unref(&priv->context_ref); - - return 0; -diff --git 
a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 04d86d7b92..24a9c94864 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -30,6 +30,7 @@ - #include - - #include "libavcodec/avcodec.h" -+#include "libavutil/pixfmt.h" - #include "v4l2_context.h" - - #define container_of(ptr, type, member) ({ \ -@@ -40,6 +41,17 @@ - { "num_output_buffers", "Number of buffers in the output context",\ - OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } - -+#define FF_V4L2_M2M_TRACK_SIZE 128 -+typedef struct V4L2m2mTrackEl { -+ int discard; // If we see this buffer its been flushed, so discard -+ int pkt_size; -+ int64_t pts; -+ int64_t reordered_opaque; -+ int64_t pkt_pos; -+ int64_t pkt_duration; -+ int64_t track_pts; -+} V4L2m2mTrackEl; -+ - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; - int fd; -@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext { - sem_t refsync; - atomic_uint refcount; - int reinit; -+ int resize_pending; - - /* null frame/packet received */ - int draining; -@@ -66,6 +79,23 @@ typedef struct V4L2m2mContext { - - /* reference back to V4L2m2mPriv */ - void *priv; -+ -+ AVBufferRef *device_ref; -+ -+ /* generate DRM frames */ -+ int output_drm; -+ -+ /* Frame tracking */ -+ int64_t last_pkt_dts; -+ int64_t last_opaque; -+ unsigned int track_no; -+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; -+ -+ /* req pkt */ -+ int req_pkt; -+ -+ /* Ext data sent */ -+ int extdata_sent; - } V4L2m2mContext; - - typedef struct V4L2m2mPriv { -@@ -76,6 +106,7 @@ typedef struct V4L2m2mPriv { - - int num_output_buffers; - int num_capture_buffers; -+ enum AVPixelFormat pix_fmt; - } V4L2m2mPriv; - - /** -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 4944d08511..7f6033ac2c 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -23,6 +23,10 @@ - - #include - #include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/hwcontext.h" -+#include 
"libavutil/hwcontext_drm.h" - #include "libavutil/pixfmt.h" - #include "libavutil/pixdesc.h" - #include "libavutil/opt.h" -@@ -30,26 +34,51 @@ - #include "codec_internal.h" - #include "libavcodec/decode.h" - -+#include "libavcodec/hwaccels.h" -+#include "libavcodec/internal.h" -+#include "libavcodec/hwconfig.h" -+ - #include "v4l2_context.h" - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" - -+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) -+{ -+ int ret; -+ struct v4l2_decoder_cmd cmd = { -+ .cmd = V4L2_DEC_CMD_START, -+ .flags = 0, -+ }; -+ -+ if (s->output.streamon) -+ return 0; -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); -+ -+ if (!s->capture.streamon || ret < 0) -+ return ret; -+ -+ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); -+ else -+ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); -+ -+ return ret; -+} -+ - static int v4l2_try_start(AVCodecContext *avctx) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; - struct v4l2_selection selection = { 0 }; - int ret; - - /* 1. 
start the output process */ -- if (!output->streamon) { -- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); -- if (ret < 0) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); -- return ret; -- } -- } -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - - if (capture->streamon) - return 0; -@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext *avctx) - } - - /* 2.1 update the AVCodecContext */ -- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -- capture->av_pix_fmt = avctx->pix_fmt; -+ capture->av_pix_fmt = -+ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -+ if (s->output_drm) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ avctx->sw_pix_fmt = capture->av_pix_fmt; -+ } -+ else -+ avctx->pix_fmt = capture->av_pix_fmt; - - /* 3. set the crop parameters */ -+#if 1 -+ selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ selection.target = V4L2_SEL_TGT_CROP_DEFAULT; -+ ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -+ av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+#else - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.r.height = avctx->coded_height; - selection.r.width = avctx->coded_width; -+ av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); - ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); -- if (!ret) { -+ av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+ if (1) { - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); -@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext *avctx) - capture->width = selection.r.width; - } - } -- -- /* 4. 
init the capture context now that we have the capture format */ -- if (!capture->buffers) { -- ret = ff_v4l2_context_init(capture); -- if (ret) { -- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); -- return AVERROR(ENOMEM); -- } -- } -+#endif - - /* 5. start the capture process */ - ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -@@ -133,50 +168,287 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) - return 0; - } - --static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -+{ -+ return (int64_t)n; -+} -+ -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -+{ -+ return (unsigned int)pts; -+} -+ -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. 
-+static void -+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) -+{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++s->track_no == 0) -+ s->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, s->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); -+ s->last_pkt_dts = avpkt->dts; -+ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ avpkt->pts = track_pts; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) -+{ -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ const V4L2m2mTrackEl *const t = s->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = s->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pts; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ s->last_opaque = s->track_els[n].reordered_opaque; -+ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } -+ -+ 
frame->best_effort_timestamp = frame->pts; -+ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); -+ return 0; -+} -+ -+static inline int stream_started(const V4L2m2mContext * const s) { -+ return s->capture.streamon && s->output.streamon; -+} -+ -+#define NQ_OK 0 -+#define NQ_Q_FULL 1 -+#define NQ_SRC_EMPTY 2 -+#define NQ_DRAINING 3 -+#define NQ_DEAD 4 -+ -+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) -+ -+// AVERROR_EOF Flushing an already flushed stream -+// -ve Error (all errors except EOF are unexpected) -+// NQ_OK (0) OK -+// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) -+// NQ_SRC_EMPTY Src empty (do not retry) -+// NQ_DRAINING At EOS, dQ dest until EOS there too -+// NQ_DEAD Not running (do not retry, do not attempt capture dQ) -+ -+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) - { -- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; - int ret; - -+ // If we don't already have a coded packet - get a new one -+ // We will already have a coded pkt if the output Q was full last time we -+ // tried to Q it - if (!s->buf_pkt.size) { - ret = ff_decode_get_packet(avctx, &s->buf_pkt); -+ -+ if (ret == AVERROR(EAGAIN)) { -+ if (!stream_started(s)) { -+ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); -+ return NQ_DEAD; -+ } -+ return NQ_SRC_EMPTY; -+ } -+ -+ if (ret == AVERROR_EOF) { -+ // EOF - enter drain mode -+ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", -+ ret, s->buf_pkt.size, stream_started(s), s->draining); -+ if (!stream_started(s)) { -+ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); -+ s->draining = 1; -+ s->capture.done = 1; -+ return AVERROR_EOF; -+ } -+ 
-+ if (!s->draining) { -+ // Calling enqueue with an empty pkt starts drain -+ av_assert0(s->buf_pkt.size == 0); -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); -+ if (ret) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); -+ return ret; -+ } -+ } -+ return NQ_DRAINING; -+ } -+ - if (ret < 0) { -- if (ret == AVERROR(EAGAIN)) -- return ff_v4l2_context_dequeue_frame(capture, frame, 0); -- else if (ret != AVERROR_EOF) -- return ret; -+ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); -+ return ret; - } -+ -+ xlat_pts_in(avctx, s, &s->buf_pkt); - } - -- if (s->draining) -- goto dequeue; -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - -- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); -- if (ret < 0 && ret != AVERROR(EAGAIN)) -- goto fail; -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, -+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, -+ 1); - -- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ -- if (ret != AVERROR(EAGAIN)) -+ if (ret == AVERROR(EAGAIN)) { -+ // Out of input buffers - keep packet -+ ret = NQ_Q_FULL; -+ } -+ else { -+ // In all other cases we are done with this packet - av_packet_unref(&s->buf_pkt); -+ s->extdata_sent = 1; - -- if (!s->draining) { -- ret = v4l2_try_start(avctx); - if (ret) { -- /* cant recover */ -- if (ret != AVERROR(ENOMEM)) -- ret = 0; -- goto fail; -+ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); -+ return ret; -+ } -+ } -+ -+ // Start if we haven't -+ { -+ const int ret2 = v4l2_try_start(avctx); -+ if (ret2) { -+ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); -+ ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; -+ } -+ } -+ -+ return ret; -+} -+ -+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int src_rv; -+ int dst_rv = 1; // Non-zero (done), non-negative (error) number -+ -+ do { -+ src_rv = try_enqueue_src(avctx, s); -+ -+ // If we got a frame last time and we have nothing to enqueue then -+ // return now. rv will be AVERROR(EAGAIN) indicating that we want more input -+ // This should mean that once decode starts we enter a stable state where -+ // we alternately ask for input and produce output -+ if (s->req_pkt && src_rv == NQ_SRC_EMPTY) -+ break; -+ -+ if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { -+ av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); -+ src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue -+ } -+ -+ // Try to get a new frame if -+ // (a) we haven't already got one AND -+ // (b) enqueue returned a status indicating that decode should be attempted -+ if (dst_rv != 0 && TRY_DQ(src_rv)) { -+ do { -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) if there isn't a frame ready yet -+ // but there is room in the input Q -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); -+ -+ if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); -+ -+ // Go again if we got a frame that we need to discard -+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); -+ } -+ -+ // Continue trying to enqueue packets if either -+ // (a) we succeeded last 
time OR -+ // (b) enqueue failed due to input Q full AND there is now room -+ } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); -+ -+ // Ensure that the frame contains nothing if we aren't returning a frame -+ // (might happen when discarding) -+ if (dst_rv) -+ av_frame_unref(frame); -+ -+ // If we got a frame this time ask for a pkt next time -+ s->req_pkt = (dst_rv == 0); -+ -+#if 0 -+ if (dst_rv == 0) -+ { -+ static int z = 0; -+ if (++z > 50) { -+ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); -+ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ return -1; - } - } -+#endif -+ -+ return dst_rv == 0 ? 0 : -+ src_rv < 0 ? src_rv : -+ dst_rv < 0 ? dst_rv : -+ AVERROR(EAGAIN); -+} -+ -+#if 0 -+#include -+static int64_t us_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; -+} - --dequeue: -- return ff_v4l2_context_dequeue_frame(capture, frame, -1); --fail: -- av_packet_unref(&s->buf_pkt); -+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ int ret; -+ const int64_t now = us_time(); -+ int64_t done; -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ ret = v4l2_receive_frame2(avctx, frame); -+ done = us_time(); -+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); - return ret; - } -+#endif - - static av_cold int v4l2_decode_init(AVCodecContext *avctx) - { -@@ -185,6 +457,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - V4L2m2mPriv *priv = avctx->priv_data; - int ret; - -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; -@@ -205,6 +480,28 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; - -+ /* the client requests the 
codec to generate DRM frames: -+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. -+ */ -+ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { -+ default: -+ s->output_drm = 1; -+ break; -+ } -+ -+ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); -+ if (!s->device_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ -+ ret = av_hwdevice_ctx_init(s->device_ref); -+ if (ret < 0) -+ return ret; -+ - s->avctx = avctx; - ret = ff_v4l2_m2m_codec_init(priv); - if (ret) { -@@ -217,7 +514,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - static av_cold int v4l2_decode_close(AVCodecContext *avctx) - { -- return ff_v4l2_m2m_codec_end(avctx->priv_data); -+ int rv; -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data); -+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); -+ return rv; -+} -+ -+static void v4l2_decode_flush(AVCodecContext *avctx) -+{ -+ // An alternatve and more drastic form of flush is to simply do this: -+ // v4l2_decode_close(avctx); -+ // v4l2_decode_init(avctx); -+ // The downside is that this keeps a decoder open until all the frames -+ // associated with it have been returned. This is a bit wasteful on -+ // possibly limited h/w resources and fails on a Pi for this reason unless -+ // more GPU mem is allocated than is the default. 
-+ -+ V4L2m2mPriv * const priv = avctx->priv_data; -+ V4L2m2mContext * const s = priv->context; -+ V4L2Context * const output = &s->output; -+ V4L2Context * const capture = &s->capture; -+ int ret, i; -+ -+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); -+ -+ // Reflushing everything is benign, quick and avoids having to worry about -+ // states like EOS processing so don't try to optimize out (having got it -+ // wrong once) -+ -+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); -+ -+ // V4L2 makes no guarantees about whether decoded frames are flushed or not -+ // so mark all frames we are tracking to be discarded if they appear -+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) -+ s->track_els[i].discard = 1; -+ -+ // resend extradata -+ s->extdata_sent = 0; -+ // clear EOS status vars -+ s->draining = 0; -+ output->done = 0; -+ capture->done = 0; -+ -+ // Stream on will occur when we actually submit a new frame -+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); - } - - #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -227,9 +570,15 @@ static const AVOption options[] = { - V4L_M2M_DEFAULT_OPTS, - { "num_capture_buffers", "Number of buffers in the capture context", - OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, -+ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, - { NULL}, - }; - -+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { -+ HW_CONFIG_INTERNAL(DRM_PRIME), -+ NULL -+}; -+ - #define M2MDEC_CLASS(NAME) \ - static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ - .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -250,11 +599,16 @@ static const AVOption options[] = { - .init = v4l2_decode_init, \ - FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \ - 
.close = v4l2_decode_close, \ -+ .flush = v4l2_decode_flush, \ - .bsfs = bsf_name, \ - .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ - .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE | \ - FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ - .p.wrapper_name = "v4l2m2m", \ -+ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ -+ AV_PIX_FMT_NV12, \ -+ AV_PIX_FMT_NONE}, \ -+ .hw_configs = v4l2_m2m_hw_configs, \ - } - - M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb"); - -From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 10 Jun 2021 18:46:21 +0100 -Subject: [PATCH 017/136] Fix crash in hw_device_default_name if type not found - (NONE) - ---- - fftools/ffmpeg_hw.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c -index 88fa782470..740a5e7153 100644 ---- a/fftools/ffmpeg_hw.c -+++ b/fftools/ffmpeg_hw.c -@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) - char *name; - size_t index_pos; - int index, index_limit = 1000; -+ if (!type_name) -+ return NULL; - index_pos = strlen(type_name); - name = av_malloc(index_pos + 4); - if (!name) - -From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 10 Jun 2021 18:59:18 +0100 -Subject: [PATCH 018/136] Allow v4l2m2m to select non-drm_prime output formats - ---- - libavcodec/v4l2_buffers.c | 2 +- - libavcodec/v4l2_m2m_dec.c | 14 ++++++++++---- - 2 files changed, 11 insertions(+), 5 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index a003934ca1..1ca1128db6 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -524,7 +524,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - offset += dst_stride * out->context->height; - } - if (offset > out->plane_info[0].length) { -- av_log(NULL, 
AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); -+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); - return -1; - } - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 7f6033ac2c..a4b5a4e7e9 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -455,10 +455,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - V4L2Context *capture, *output; - V4L2m2mContext *s; - V4L2m2mPriv *priv = avctx->priv_data; -+ int gf_pix_fmt; - int ret; - - av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) -@@ -486,10 +486,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - * - the DRM frame format is passed in the DRM frame descriptor layer. - * check the v4l2_get_drm_frame function. - */ -- switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { -- default: -+ -+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); -+ -+ s->output_drm = 0; -+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - s->output_drm = 1; -- break; - } - - s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); -@@ -607,6 +612,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { - .p.wrapper_name = "v4l2m2m", \ - .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ - AV_PIX_FMT_NV12, \ -+ AV_PIX_FMT_YUV420P, \ - AV_PIX_FMT_NONE}, \ - .hw_configs = v4l2_m2m_hw_configs, \ - } - -From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 10 Jun 2021 18:59:38 +0100 -Subject: 
[PATCH 019/136] Fix YUV420P output from v4l2m2m - -Also put get_width get_height inlines in header as they are generally -useful. ---- - libavcodec/v4l2_buffers.c | 12 ++++++------ - libavcodec/v4l2_context.c | 22 ++++++---------------- - libavcodec/v4l2_m2m.h | 12 ++++++++++++ - 3 files changed, 24 insertions(+), 22 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 1ca1128db6..f4c11ca8d0 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -425,17 +425,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - case AV_PIX_FMT_NV21: - if (avbuf->num_planes > 1) - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -+ frame->linesize[1] = frame->linesize[0]; -+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); - break; - - case AV_PIX_FMT_YUV420P: - if (avbuf->num_planes > 1) - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; -- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); -+ frame->linesize[1] = frame->linesize[0] / 2; -+ frame->linesize[2] = frame->linesize[1]; -+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); -+ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; - break; - - default: -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index be76068af3..6fe2586627 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -55,16 +55,6 @@ static inline 
AVCodecContext *logger(V4L2Context *ctx) - return ctx_to_m2mctx(ctx)->avctx; - } - --static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; --} -- --static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; --} -- - static AVRational v4l2_get_sar(V4L2Context *ctx) - { - struct AVRational sar = { 0, 1 }; -@@ -96,8 +86,8 @@ static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2 - if (ret) - av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", - ctx->name, -- v4l2_get_width(fmt1), v4l2_get_height(fmt1), -- v4l2_get_width(fmt2), v4l2_get_height(fmt2)); -+ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), -+ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); - - return ret; - } -@@ -195,8 +185,8 @@ static int do_source_change(V4L2m2mContext * const s) - - reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); - if (reinit) { -- s->capture.height = v4l2_get_height(&cap_fmt); -- s->capture.width = v4l2_get_width(&cap_fmt); -+ s->capture.height = ff_v4l2_get_format_height(&cap_fmt); -+ s->capture.width = ff_v4l2_get_format_width(&cap_fmt); - } - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - -@@ -973,8 +963,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers - av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), - req.count, -- v4l2_get_width(&ctx->format), -- v4l2_get_height(&ctx->format), -+ ff_v4l2_get_format_width(&ctx->format), -+ ff_v4l2_get_format_height(&ctx->format), - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 24a9c94864..8f054f2f50 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -160,4 +160,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); - */ - int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); - -+ -+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -+} -+ -+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -+} -+ -+ - #endif /* AVCODEC_V4L2_M2M_H */ - -From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 10 Jun 2021 19:23:44 +0100 -Subject: [PATCH 020/136] Report buffer overflows in v4l2m2m - ---- - libavcodec/v4l2_buffers.c | 14 ++++++++++---- - libavcodec/v4l2_context.c | 5 ++++- - 2 files changed, 14 insertions(+), 5 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index f4c11ca8d0..de31f7ced9 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -364,6 +364,7 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) - { - unsigned int bytesused, length; -+ int rv = 0; - - if (plane >= out->num_planes) - return AVERROR(EINVAL); -@@ -371,11 +372,16 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i - length = out->plane_info[plane].length; - bytesused = FFMIN(size+offset, length); - -- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); -+ if 
(size > length - offset) { -+ size = length - offset; -+ rv = AVERROR(ENOMEM); -+ } -+ -+ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); - - set_buf_length(out, plane, bytesused, length); - -- return 0; -+ return rv; - } - - static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) -@@ -630,7 +636,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - } - - ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); -- if (ret) -+ if (ret && ret != AVERROR(ENOMEM)) - return ret; - - v4l2_set_pts(out, pkt->pts, no_rescale_pts); -@@ -638,7 +644,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; - -- return 0; -+ return ret; - } - - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 6fe2586627..81aced0c2b 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -824,7 +824,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - return AVERROR(EAGAIN); - - ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); -- if (ret) -+ if (ret == AVERROR(ENOMEM)) -+ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", -+ __func__, pkt->size, avbuf->planes[0].length); -+ else if (ret) - return ret; - - return ff_v4l2_buffer_enqueue(avbuf); - -From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 14 Jun 2021 11:55:16 +0100 -Subject: [PATCH 021/136] Increase V4L2 H264 stateful coded buffer size - -Try to set a min size of frame size / 2 for bitbuffers passed to V4l2. -This fixes a few streams that have large I-frames. You would hope -Annex-A gave useful minCR so an appropriate size could be calculated -but it doesn't really. 
It gives good guidance for bits required over -time but the instantaneous limits are very weak so it is possible -that even this won't be enough. The correct long term solution would -be to have resizable dmabufs but that is a greter rewrite than seems -sensible now. ---- - libavcodec/v4l2_context.c | 24 +++++++++++++++++++++++- - libavcodec/v4l2_context.h | 6 ++++++ - libavcodec/v4l2_m2m_dec.c | 24 ++++++++++++++++++++++++ - 3 files changed, 53 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 81aced0c2b..a17ae027a6 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -902,7 +902,29 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) - - int ff_v4l2_context_set_format(V4L2Context* ctx) - { -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ int ret; -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ if (ret != 0) -+ return ret; -+ -+ // Check returned size against min size and if smaller have another go -+ // Only worry about plane[0] as this is meant to enforce limits for -+ // encoded streams where we might know a bit more about the shape -+ // than the driver -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) -+ return 0; -+ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; -+ } -+ else { -+ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) -+ return 0; -+ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; -+ } -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ return ret; - } - - void ff_v4l2_context_release(V4L2Context* ctx) -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 59009d11d1..37b0431400 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -75,6 +75,12 @@ typedef struct V4L2Context { - AVRational sample_aspect_ratio; - struct v4l2_rect selection; - 
-+ /** -+ * If the default size of buffer is less than this then try to -+ * set to this. -+ */ -+ uint32_t min_buf_size; -+ - /** - * Indexed array of pointers to V4L2Buffers - */ -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index a4b5a4e7e9..1851acbc93 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -450,6 +450,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - #endif - -+static uint32_t max_coded_size(const AVCodecContext * const avctx) -+{ -+ uint32_t wxh = avctx->coded_width * avctx->coded_height; -+ uint32_t size; -+ -+ // Currently the only thing we try to set our own limits for is H264 -+ if (avctx->codec_id != AV_CODEC_ID_H264) -+ return 0; -+ -+ size = wxh * 3 / 2; -+ // H.264 Annex A table A-1 gives minCR which is either 2 or 4 -+ // unfortunately that doesn't yield an actually useful limit -+ // and it should be noted that frame 0 is special cased to allow -+ // a bigger number which really isn't helpful for us. 
So just pick -+ // frame_size / 2 -+ size /= 2; -+ // Add 64k to allow for any overheads and/or encoder hopefulness -+ // with small WxH -+ return size + (1 << 16); -+} -+ - static av_cold int v4l2_decode_init(AVCodecContext *avctx) - { - V4L2Context *capture, *output; -@@ -460,6 +481,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); - -+ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; -@@ -476,9 +498,11 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - output->av_codec_id = avctx->codec_id; - output->av_pix_fmt = AV_PIX_FMT_NONE; -+ output->min_buf_size = max_coded_size(avctx); - - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; -+ capture->min_buf_size = 0; - - /* the client requests the codec to generate DRM frames: - * - data[0] will therefore point to the returned AVDRMFrameDescriptor - -From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 28 Jun 2021 12:13:35 +0100 -Subject: [PATCH 022/136] Fix raw video s.t. it respects any remaining cropping - -This fixes the long standing CONFWIN_A conformance test failure for drm. ---- - libavcodec/rawenc.c | 32 ++++++++--- - libavutil/hwcontext_drm.c | 112 ++++++++++++++++++++++++++++++++++++-- - 2 files changed, 130 insertions(+), 14 deletions(-) - -diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index 594a77c42a..8ca0379e12 100644 ---- a/libavcodec/rawenc.c -+++ b/libavcodec/rawenc.c -@@ -124,32 +124,41 @@ static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, - - - static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, -- const AVFrame *frame, int *got_packet) -+ const AVFrame *src_frame, int *got_packet) - { - int ret; -+ AVFrame * frame = NULL; - - #if CONFIG_SAND -- if (av_rpi_is_sand_frame(frame)) { -- ret = av_rpi_is_sand8_frame(frame) ? 
raw_sand8_as_yuv420(avctx, pkt, frame) : -- av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : -- av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1; -+ if (av_rpi_is_sand_frame(src_frame)) { -+ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : -+ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : -+ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1; - *got_packet = (ret == 0); - return ret; - } - #endif - -+ if ((frame = av_frame_clone(src_frame)) == NULL) { -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) -+ goto fail; -+ - ret = av_image_get_buffer_size(frame->format, - frame->width, frame->height, 1); - if (ret < 0) -- return ret; -+ goto fail; - - if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0) -- return ret; -+ goto fail; - if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, - (const uint8_t **)frame->data, frame->linesize, - frame->format, - frame->width, frame->height, 1)) < 0) -- return ret; -+ goto fail; - - if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && - frame->format == AV_PIX_FMT_YUYV422) { -@@ -165,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16); - } - } -+ pkt->flags |= AV_PKT_FLAG_KEY; -+ av_frame_free(&frame); - *got_packet = 1; - return 0; -+ -+fail: -+ av_frame_free(&frame); -+ *got_packet = 0; -+ return ret; - } - - const FFCodec ff_rawvideo_encoder = { -diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c -index 7a9fdbd263..baf18920fa 100644 ---- a/libavutil/hwcontext_drm.c -+++ b/libavutil/hwcontext_drm.c -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - - /* This was introduced in version 4.6. And may not exist all without an - * optional package. 
So to prevent a hard dependency on needing the Linux -@@ -31,6 +32,7 @@ - #endif - - #include -+#include - #include - - #include "avassert.h" -@@ -38,7 +40,9 @@ - #include "hwcontext_drm.h" - #include "hwcontext_internal.h" - #include "imgutils.h" -- -+#if CONFIG_SAND -+#include "libavutil/rpi_sand_fns.h" -+#endif - - static void drm_device_free(AVHWDeviceContext *hwdev) - { -@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, - AVDRMDeviceContext *hwctx = hwdev->hwctx; - drmVersionPtr version; - -+ if (device == NULL) { -+ hwctx->fd = -1; -+ return 0; -+ } -+ - hwctx->fd = open(device, O_RDWR); - if (hwctx->fd < 0) - return AVERROR(errno); -@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - if (flags & AV_HWFRAME_MAP_WRITE) - mmap_prot |= PROT_WRITE; - -+ if (dst->format == AV_PIX_FMT_NONE) -+ dst->format = hwfc->sw_format; - #if HAVE_LINUX_DMA_BUF_H - if (flags & AV_HWFRAME_MAP_READ) - map->sync_flags |= DMA_BUF_SYNC_READ; -@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - - dst->width = src->width; - dst->height = src->height; -+ dst->crop_top = src->crop_top; -+ dst->crop_bottom = src->crop_bottom; -+ dst->crop_left = src->crop_left; -+ dst->crop_right = src->crop_right; -+ -+#if CONFIG_SAND -+ // Rework for sand frames -+ if (av_rpi_is_sand_frame(dst)) { -+ // As it stands the sand formats hold stride2 in linesize[3] -+ // linesize[0] & [1] contain stride1 which is always 128 for everything we do -+ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] -+ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); -+ dst->linesize[0] = 128; -+ dst->linesize[1] = 128; -+ // *** Are we sure src->height is actually what we want ??? 
-+ } -+#endif - - err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, - &drm_unmap_frame, map); -@@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - if (!pix_fmts) - return AVERROR(ENOMEM); - -- pix_fmts[0] = ctx->sw_format; -+ // **** Offer native sand too ???? -+ pix_fmts[0] = -+#if CONFIG_SAND -+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? -+ AV_PIX_FMT_YUV420P : -+ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? -+ AV_PIX_FMT_YUV420P10LE : -+#endif -+ ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; - - *formats = pix_fmts; -@@ -231,18 +267,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, - map = av_frame_alloc(); - if (!map) - return AVERROR(ENOMEM); -- map->format = dst->format; - -+ // Map to default -+ map->format = AV_PIX_FMT_NONE; - err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); - if (err) - goto fail; - -- map->width = dst->width; -- map->height = dst->height; -+#if 0 -+ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, -+ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, -+ map->width, map->height, -+ map->linesize[0], -+ map->linesize[1], -+ map->linesize[2], -+ map->linesize[3], -+ dst->width, dst->height, -+ dst->linesize[0], -+ dst->linesize[1], -+ dst->linesize[2]); -+#endif -+#if CONFIG_SAND -+ if (av_rpi_is_sand_frame(map)) { -+ // Preserve crop - later ffmpeg code assumes that we have in that it -+ // overwrites any crop that we create with the old values -+ const unsigned int w = FFMIN(dst->width, map->width); -+ const unsigned int h = FFMIN(dst->height, map->height); -+ -+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ 
map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); -+ err = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ dst->width = w; -+ dst->height = h; -+ } -+ else -+#endif -+ { -+ // Kludge mapped h/w s.t. frame_copy works -+ map->width = dst->width; -+ map->height = dst->height; -+ err = av_frame_copy(dst, map); -+ } - -- err = av_frame_copy(dst, map); - if (err) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); - goto fail; -+ } - - err = 0; - fail: -@@ -257,7 +354,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, - int err; - - if (src->width > hwfc->width || src->height > hwfc->height) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); - return AVERROR(EINVAL); -+ } - - map = av_frame_alloc(); - if (!map) - -From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 13 Aug 2021 15:38:28 +0100 -Subject: [PATCH 023/136] Set frame interlace from V4L2 buffer field - ---- - libavcodec/v4l2_buffers.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index de31f7ced9..97b8eb1db3 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -222,6 +222,16 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) - return AVCOL_TRC_UNSPECIFIED; - } - -+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) -+{ -+ return 
V4L2_FIELD_IS_INTERLACED(buf->buf.field); -+} -+ -+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) -+{ -+ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; -+} -+ - static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) - { - AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -@@ -576,6 +586,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc - frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); - frame->pkt_dts = AV_NOPTS_VALUE; -+ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); -+ frame->top_field_first = v4l2_buf_is_top_first(avbuf); - - /* these values are updated also during re-init in v4l2_process_driver_event */ - frame->height = ctx->height; - -From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 13 Aug 2021 16:11:53 +0100 -Subject: [PATCH 024/136] Fix V4L2 stateful to avoid crash if flush before - start - ---- - libavcodec/v4l2_context.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index a17ae027a6..eb901e8fab 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -713,6 +713,10 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) - static void flush_all_buffers_status(V4L2Context* const ctx) - { - int i; -+ -+ if (!ctx->bufrefs) -+ return; -+ - for (i = 0; i < ctx->num_buffers; ++i) { - struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; - if (buf->status == V4L2BUF_IN_DRIVER) - -From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 9 Sep 2021 17:44:13 +0100 -Subject: [PATCH 025/136] Copy properties from frame to v4l2 buffer - -Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that -ff_v4l2_buffer_buf_to_avframe copies ---- - libavcodec/v4l2_buffers.c | 126 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 126 
insertions(+) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 97b8eb1db3..126d2a17f4 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -128,6 +128,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) - return AVCOL_PRI_UNSPECIFIED; - } - -+static void v4l2_set_color(V4L2Buffer *buf, -+ const enum AVColorPrimaries avcp, -+ const enum AVColorSpace avcs, -+ const enum AVColorTransferCharacteristic avxc) -+{ -+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; -+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; -+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; -+ -+ switch (avcp) { -+ case AVCOL_PRI_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ ycbcr = V4L2_YCBCR_ENC_709; -+ break; -+ case AVCOL_PRI_BT470M: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ ycbcr = V4L2_YCBCR_ENC_601; -+ break; -+ case AVCOL_PRI_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_PRI_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_PRI_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_PRI_BT2020: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ case AVCOL_PRI_SMPTE428: -+ case AVCOL_PRI_SMPTE431: -+ case AVCOL_PRI_SMPTE432: -+ case AVCOL_PRI_EBU3213: -+ case AVCOL_PRI_RESERVED: -+ case AVCOL_PRI_FILM: -+ case AVCOL_PRI_UNSPECIFIED: -+ default: -+ break; -+ } -+ -+ switch (avcs) { -+ case AVCOL_SPC_RGB: -+ cs = V4L2_COLORSPACE_SRGB; -+ break; -+ case AVCOL_SPC_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ break; -+ case AVCOL_SPC_FCC: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ break; -+ case AVCOL_SPC_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_SPC_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_SPC_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_SPC_BT2020_CL: -+ cs = V4L2_COLORSPACE_BT2020; -+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; -+ break; -+ case AVCOL_SPC_BT2020_NCL: -+ cs = 
V4L2_COLORSPACE_BT2020; -+ break; -+ default: -+ break; -+ } -+ -+ switch (xfer) { -+ case AVCOL_TRC_BT709: -+ xfer = V4L2_XFER_FUNC_709; -+ break; -+ case AVCOL_TRC_IEC61966_2_1: -+ xfer = V4L2_XFER_FUNC_SRGB; -+ break; -+ case AVCOL_TRC_SMPTE240M: -+ xfer = V4L2_XFER_FUNC_SMPTE240M; -+ break; -+ case AVCOL_TRC_SMPTE2084: -+ xfer = V4L2_XFER_FUNC_SMPTE2084; -+ break; -+ default: -+ break; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { -+ buf->context->format.fmt.pix_mp.colorspace = cs; -+ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; -+ buf->context->format.fmt.pix_mp.xfer_func = xfer; -+ } else { -+ buf->context->format.fmt.pix.colorspace = cs; -+ buf->context->format.fmt.pix.ycbcr_enc = ycbcr; -+ buf->context->format.fmt.pix.xfer_func = xfer; -+ } -+} -+ - static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - { - enum v4l2_quantization qt; -@@ -146,6 +245,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - return AVCOL_RANGE_UNSPECIFIED; - } - -+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) -+{ -+ const enum v4l2_quantization q = -+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : -+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : -+ V4L2_QUANTIZATION_DEFAULT; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { -+ buf->context->format.fmt.pix_mp.quantization = q; -+ } else { -+ buf->context->format.fmt.pix.quantization = q; -+ } -+} -+ - static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) - { - enum v4l2_ycbcr_encoding ycbcr; -@@ -232,6 +345,12 @@ static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) - return buf->buf.field == V4L2_FIELD_INTERLACED_TB; - } - -+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) -+{ -+ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : -+ is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; -+} -+ - static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) - { - AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -@@ -561,7 +680,14 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { -+ out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); -+ // Beware that colour info is held in format rather than the actual -+ // v4l2 buffer struct so this may not be as useful as you might hope -+ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); -+ v4l2_set_color_range(out, frame->color_range); -+ // PTS & interlace are buffer vars - v4l2_set_pts(out, frame->pts, 0); -+ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - - return v4l2_buffer_swframe_to_buf(frame, out); - } - -From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 17 Nov 2021 16:49:01 +0000 -Subject: [PATCH 026/136] ffmpeg: Do not inc DTS on no decode output - -V4L2 H264 decode has long latency and sometimes spits out a long stream -of output without input. In this case incrementing DTS is wrong. There -may be cases where the condition as written is correct so only "fix" in -the cases which cause problems ---- - fftools/ffmpeg.c | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 5dc2cd73c1..ba0c1898cf 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -2609,7 +2609,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo - case AVMEDIA_TYPE_VIDEO: - ret = decode_video (ist, repeating ? 
NULL : avpkt, &got_output, &duration_pts, !pkt, - &decode_failed); -- if (!repeating || !pkt || got_output) { -+ // Pi: Do not inc dts if no_cvt_hw set -+ // V4L2 H264 decode has long latency and sometimes spits out a long -+ // stream of output without input. In this case incrementing DTS is wrong. -+ // There may be cases where the condition as written is correct so only -+ // "fix" in the cases which cause problems -+ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { - if (pkt && pkt->duration) { - duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); - } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { - -From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 17 Nov 2021 17:32:59 +0000 -Subject: [PATCH 027/136] v4l2_m2m_dec: Adjust timebase if H264 - -Adjust AVCodecContext time_base if H264 in the same way that the -software decoder does. ---- - libavcodec/v4l2_m2m_dec.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 1851acbc93..aa1e5c1597 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -481,6 +481,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); - -+ if (avctx->codec_id == AV_CODEC_ID_H264) { -+ if (avctx->ticks_per_frame == 1) { -+ if(avctx->time_base.den < INT_MAX/2) { -+ avctx->time_base.den *= 2; -+ } else -+ avctx->time_base.num /= 2; -+ } -+ avctx->ticks_per_frame = 2; -+ } -+ - av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - -From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 17 Nov 2021 17:38:27 +0000 -Subject: [PATCH 028/136] v4l2_m2m_dec: Produce best guess PTSs if none - supplied - -Filter scheduling gets confused by missing PTSs and makes poor 
guesses -more often than not. Try to generate plausible timestamps where we are -missing them. ---- - libavcodec/v4l2_m2m.h | 12 ++++++++ - libavcodec/v4l2_m2m_dec.c | 64 +++++++++++++++++++++++++++++++++++++-- - 2 files changed, 74 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 8f054f2f50..82feb0afdb 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -52,6 +52,16 @@ typedef struct V4L2m2mTrackEl { - int64_t track_pts; - } V4L2m2mTrackEl; - -+typedef struct pts_stats_s -+{ -+ void * logctx; -+ const char * name; // For debug -+ unsigned int last_count; -+ unsigned int last_interval; -+ int64_t last_pts; -+ int64_t guess; -+} pts_stats_t; -+ - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; - int fd; -@@ -91,6 +101,8 @@ typedef struct V4L2m2mContext { - unsigned int track_no; - V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; - -+ pts_stats_t pts_stat; -+ - /* req pkt */ - int req_pkt; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index aa1e5c1597..a5a2afbd27 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -42,6 +42,62 @@ - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" - -+// Pick 64 for max last count - that is >1sec at 60fps -+#define STATS_LAST_COUNT_MAX 64 -+#define STATS_INTERVAL_MAX (1 << 30) -+ -+static int64_t pts_stats_guess(const pts_stats_t * const stats) -+{ -+ if (stats->last_pts == AV_NOPTS_VALUE || -+ stats->last_interval == 0 || -+ stats->last_count >= STATS_LAST_COUNT_MAX) -+ return AV_NOPTS_VALUE; -+ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; -+} -+ -+static void pts_stats_add(pts_stats_t * const stats, int64_t pts) -+{ -+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { -+ if (stats->last_count < STATS_LAST_COUNT_MAX) -+ ++stats->last_count; -+ return; -+ } -+ -+ if (stats->last_pts != AV_NOPTS_VALUE) { -+ const int64_t interval = pts - stats->last_pts; -+ -+ if 
(interval < 0 || interval >= STATS_INTERVAL_MAX || -+ stats->last_count >= STATS_LAST_COUNT_MAX) { -+ if (stats->last_interval != 0) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", -+ __func__, stats->name, interval, stats->last_count); -+ stats->last_interval = 0; -+ } -+ else { -+ const int64_t frame_time = interval / (int64_t)stats->last_count; -+ -+ if (frame_time != stats->last_interval) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", -+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); -+ stats->last_interval = frame_time; -+ } -+ } -+ -+ stats->last_pts = pts; -+ stats->last_count = 1; -+} -+ -+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) -+{ -+ *stats = (pts_stats_t){ -+ .logctx = logctx, -+ .name = name, -+ .last_count = 1, -+ .last_interval = 0, -+ .last_pts = AV_NOPTS_VALUE -+ }; -+} -+ - static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) - { - int ret; -@@ -244,9 +300,11 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons - return -1; - } - -- frame->best_effort_timestamp = frame->pts; -+ pts_stats_add(&s->pts_stat, frame->pts); -+ -+ frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); - frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
-- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); - return 0; - } - -@@ -496,6 +554,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if (ret < 0) - return ret; - -+ pts_stats_init(&s->pts_stat, avctx, "decoder"); -+ - capture = &s->capture; - output = &s->output; - - -From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 17 Nov 2021 17:59:27 +0000 -Subject: [PATCH 029/136] v4l2_m2m_dec: Try harder to get an initial frame - -If the input Q is full then wait on a short timeout for a capture frame -rather than stuffing yet still another frame into the input if we could -do that first. This attempts to restrict the sometimes daft initial -buffering that ends up confusing the rest of the system. ---- - libavcodec/v4l2_context.c | 2 +- - libavcodec/v4l2_m2m_dec.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index eb901e8fab..ee5dc7b8d4 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -381,7 +381,7 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - start: - if (is_capture) { - /* no need to listen to requests for more input while draining */ -- if (ctx_to_m2mctx(ctx)->draining) -+ if (ctx_to_m2mctx(ctx)->draining || timeout > 0) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; - } else { - pfd.events = POLLOUT | POLLWRNORM; -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index a5a2afbd27..b49f470c0a 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -442,7 +442,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // when discarding - // This returns AVERROR(EAGAIN) if there isn't a frame ready yet - // but there is room 
in the input Q -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); - - if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) - av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", - -From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 17 Nov 2021 18:04:56 +0000 -Subject: [PATCH 030/136] Add a V4L2 M2M deinterlace filter - -Add a V4L2 deinterlace filter that will accept DRMPRIME frames. - -Multiple people have contributed to this: -Jernej Skrabec -Alex Bee -popcornmix -John Cox - -There is an unknown delay through the filter of typically one or three -fields which translates to 1 or 2 frames. Frames that are delayed are -lost at end of stream as the V4L2 filter has no flush control. ---- - libavcodec/v4l2_context.c | 4 +- - libavfilter/Makefile | 1 + - libavfilter/allfilters.c | 1 + - libavfilter/vf_deinterlace_v4l2m2m.c | 1269 ++++++++++++++++++++++++++ - 4 files changed, 1273 insertions(+), 2 deletions(-) - create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ee5dc7b8d4..440dfaaba5 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -498,10 +498,10 @@ dequeue: - return NULL; - } - --ctx->q_count; -- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", -+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", - ctx->name, buf.index, - buf.timestamp.tv_sec, buf.timestamp.tv_usec, -- ctx->q_count, ++ctx->dq_count); -+ ctx->q_count, ++ctx->dq_count, buf.field); - - avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; - avbuf->status = V4L2BUF_AVAILABLE; -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index c14fc995a0..0e7b5856bd 100644 ---- a/libavfilter/Makefile 
-+++ b/libavfilter/Makefile -@@ -262,6 +262,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o - OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o - OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_vpp_qsv.o - OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o -+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o - OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o - OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o - OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index b990a00152..357ff61ca8 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -248,6 +248,7 @@ extern const AVFilter ff_vf_derain; - extern const AVFilter ff_vf_deshake; - extern const AVFilter ff_vf_deshake_opencl; - extern const AVFilter ff_vf_despill; -+extern const AVFilter ff_vf_deinterlace_v4l2m2m; - extern const AVFilter ff_vf_detelecine; - extern const AVFilter ff_vf_dilation; - extern const AVFilter ff_vf_dilation_opencl; -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -new file mode 100644 -index 0000000000..1a933b7e0a ---- /dev/null -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -0,0 +1,1269 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * deinterlace video filter - V4L2 M2M -+ */ -+ -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/avstring.h" +#include "libavutil/common.h" -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" +#include "libavutil/internal.h" -+#include "libavutil/mathematics.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/time.h" -+ -+#define FF_INTERNAL_FIELDS 1 -+#include "framequeue.h" -+#include "filters.h" -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct V4L2Queue V4L2Queue; -+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; -+ -+typedef struct V4L2PlaneInfo { -+ int bytesperline; -+ size_t length; -+} V4L2PlaneInfo; -+ -+typedef struct V4L2Buffer { -+ int enqueued; -+ int reenqueue; -+ int fd; -+ struct v4l2_buffer buffer; -+ AVFrame frame; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int num_planes; -+ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; -+ AVDRMFrameDescriptor drm_frame; -+ V4L2Queue *q; -+} V4L2Buffer; -+ -+typedef struct V4L2Queue { -+ struct v4l2_format format; -+ int num_buffers; -+ V4L2Buffer *buffers; -+ DeintV4L2M2MContextShared *ctx; -+} V4L2Queue; -+ -+typedef struct pts_stats_s -+{ -+ void * logctx; -+ const char * name; // For debug -+ unsigned int last_count; -+ unsigned int last_interval; -+ int64_t last_pts; -+} pts_stats_t; -+ -+#define PTS_TRACK_SIZE 32 -+typedef struct pts_track_el_s -+{ -+ uint32_t n; -+ unsigned int interval; -+ AVFrame * props; -+} pts_track_el_t; -+ -+typedef struct pts_track_s -+{ -+ uint32_t n; -+ 
uint32_t last_n; -+ int got_2; -+ void * logctx; -+ pts_stats_t stats; -+ pts_track_el_t a[PTS_TRACK_SIZE]; -+} pts_track_t; -+ -+typedef struct DeintV4L2M2MContextShared { -+ void * logctx; // For logging - will be NULL when done -+ -+ int fd; -+ int done; -+ int width; -+ int height; -+ int orig_width; -+ int orig_height; -+ atomic_uint refcount; -+ -+ AVBufferRef *hw_frames_ctx; -+ -+ unsigned int field_order; -+ -+ pts_track_t track; -+ -+ V4L2Queue output; -+ V4L2Queue capture; -+} DeintV4L2M2MContextShared; -+ -+typedef struct DeintV4L2M2MContext { -+ const AVClass *class; -+ -+ DeintV4L2M2MContextShared *shared; -+} DeintV4L2M2MContext; -+ -+static unsigned int pts_stats_interval(const pts_stats_t * const stats) -+{ -+ return stats->last_interval; -+} -+ -+// Pick 64 for max last count - that is >1sec at 60fps -+#define STATS_LAST_COUNT_MAX 64 -+#define STATS_INTERVAL_MAX (1 << 30) -+static void pts_stats_add(pts_stats_t * const stats, int64_t pts) -+{ -+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { -+ if (stats->last_count < STATS_LAST_COUNT_MAX) -+ ++stats->last_count; -+ return; -+ } -+ -+ if (stats->last_pts != AV_NOPTS_VALUE) { -+ const int64_t interval = pts - stats->last_pts; -+ -+ if (interval < 0 || interval >= STATS_INTERVAL_MAX || -+ stats->last_count >= STATS_LAST_COUNT_MAX) { -+ if (stats->last_interval != 0) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", -+ __func__, stats->name, interval, stats->last_count); -+ stats->last_interval = 0; -+ } -+ else { -+ const int64_t frame_time = interval / (int64_t)stats->last_count; -+ -+ if (frame_time != stats->last_interval) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", -+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); -+ stats->last_interval = frame_time; -+ } -+ } -+ -+ stats->last_pts = pts; -+ stats->last_count = 1; -+} -+ -+static void pts_stats_init(pts_stats_t * 
const stats, void * logctx, const char * name) -+{ -+ *stats = (pts_stats_t){ -+ .logctx = logctx, -+ .name = name, -+ .last_count = 1, -+ .last_interval = 0, -+ .last_pts = AV_NOPTS_VALUE -+ }; -+} -+ -+static inline uint32_t pts_track_next_n(pts_track_t * const trk) -+{ -+ if (++trk->n == 0) -+ trk->n = 1; -+ return trk->n; -+} -+ -+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) -+{ -+ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); -+ pts_track_el_t * t; -+ -+ // As a first guess assume that n==0 means last frame -+ if (n == 0) { -+ n = trk->last_n; -+ if (n == 0) -+ goto fail; -+ } -+ -+ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); -+ -+ if (t->n != n) { -+ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); -+ goto fail; -+ } -+ -+ // 1st frame is simple - just believe it -+ if (n != trk->last_n) { -+ trk->last_n = n; -+ trk->got_2 = 0; -+ return av_frame_copy_props(dst, t->props); -+ } -+ -+ // Only believe in a single interpolated frame -+ if (trk->got_2) -+ goto fail; -+ trk->got_2 = 1; -+ -+ av_frame_copy_props(dst, t->props); -+ -+ -+ // If we can't guess - don't -+ if (t->interval == 0) { -+ dst->best_effort_timestamp = AV_NOPTS_VALUE; -+ dst->pts = AV_NOPTS_VALUE; -+ dst->pkt_dts = AV_NOPTS_VALUE; -+ } -+ else { -+ if (dst->best_effort_timestamp != AV_NOPTS_VALUE) -+ dst->best_effort_timestamp += t->interval / 2; -+ if (dst->pts != AV_NOPTS_VALUE) -+ dst->pts += t->interval / 2; -+ if (dst->pkt_dts != AV_NOPTS_VALUE) -+ dst->pkt_dts += t->interval / 2; -+ } -+ -+ return 0; -+ -+fail: -+ trk->last_n = 0; -+ trk->got_2 = 0; -+ dst->pts = AV_NOPTS_VALUE; -+ dst->pkt_dts = AV_NOPTS_VALUE; -+ return 0; -+} -+ -+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) -+{ -+ const uint32_t n = pts_track_next_n(trk); -+ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); -+ -+ 
pts_stats_add(&trk->stats, src->pts); -+ -+ t->n = n; -+ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last -+ av_frame_unref(t->props); -+ av_frame_copy_props(t->props, src); -+ -+ // We now know what the previous interval was, rather than having to guess, -+ // so set it. There is a better than decent chance that this is before -+ // we use it. -+ if (t->interval != 0) { -+ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); -+ prev_t->interval = t->interval; -+ } -+ -+ // In case deinterlace interpolates frames use every other usec -+ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; -+} -+ -+static void pts_track_uninit(pts_track_t * const trk) -+{ -+ unsigned int i; -+ for (i = 0; i != PTS_TRACK_SIZE; ++i) { -+ trk->a[i].n = 0; -+ av_frame_free(&trk->a[i].props); -+ } -+} -+ -+static int pts_track_init(pts_track_t * const trk, void *logctx) -+{ -+ unsigned int i; -+ trk->n = 1; -+ pts_stats_init(&trk->stats, logctx, "track"); -+ for (i = 0; i != PTS_TRACK_SIZE; ++i) { -+ trk->a[i].n = 0; -+ if ((trk->a[i].props = av_frame_alloc()) == NULL) { -+ pts_track_uninit(trk); -+ return AVERROR(ENOMEM); -+ } -+ } -+ return 0; -+} -+ -+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) -+{ -+ struct v4l2_capability cap; -+ int ret; -+ -+ memset(&cap, 0, sizeof(cap)); -+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); -+ if (ret < 0) -+ return ret; -+ -+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) -+ return AVERROR(EINVAL); -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ -+ return 0; -+ } -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ -+ return 0; -+ } -+ -+ return AVERROR(EINVAL); -+} -+ 
-+static int deint_v4l2m2m_try_format(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret, field; -+ -+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); -+ if (ret) -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); -+ -+ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_NONE; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = ctx->width; -+ fmt->fmt.pix_mp.height = ctx->height; -+ } else { -+ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = ctx->width; -+ fmt->fmt.pix.height = ctx->height; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, -+ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, -+ fmt->fmt.pix_mp.pixelformat, -+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); -+ -+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); -+ if (ret) -+ return AVERROR(EINVAL); -+ -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, -+ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, -+ fmt->fmt.pix_mp.pixelformat, -+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || -+ fmt->fmt.pix_mp.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return AVERROR(EINVAL); -+ } -+ } else { -+ if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || -+ fmt->fmt.pix.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return 
AVERROR(EINVAL); -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret; -+ -+ struct v4l2_selection sel = { -+ .type = fmt->type, -+ .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, -+ }; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = width; -+ fmt->fmt.pix_mp.height = ysize / pitch; -+ fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; -+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); -+ } else { -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = width; -+ fmt->fmt.pix.height = height; -+ fmt->fmt.pix.sizeimage = 0; -+ fmt->fmt.pix.bytesperline = 0; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); -+ if (ret) -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); -+ -+ ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); -+ if (ret) -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); -+ -+ sel.r.width = width; -+ sel.r.height = height; -+ sel.r.left = 0; -+ sel.r.top = 0; -+ sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, -+ sel.flags = V4L2_SEL_FLAG_LE; -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); -+ if (ret) -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) -+{ -+ int ret; -+ -+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); -+ if (ctx->fd < 0) -+ return AVERROR(errno); -+ -+ ret = deint_v4l2m2m_prepare_context(ctx); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->capture); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->output); -+ if (ret) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ close(ctx->fd); -+ ctx->fd = -1; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) -+{ -+ int ret = AVERROR(EINVAL); -+ struct dirent *entry; -+ char node[PATH_MAX]; -+ DIR *dirp; -+ -+ dirp = opendir("/dev"); -+ if (!dirp) -+ return AVERROR(errno); -+ -+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { -+ -+ if (strncmp(entry->d_name, "video", 5)) -+ continue; -+ -+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); -+ ret = deint_v4l2m2m_probe_device(ctx, node); -+ if (!ret) -+ break; -+ } -+ -+ closedir(dirp); -+ -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); -+ ctx->fd = -1; -+ -+ return ret; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) -+{ -+ int ret; -+ -+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ buf->enqueued = 1; -+ -+ return 0; -+} -+ -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+{ -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; -+ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); -+ 
-+ expbuf.index = avbuf->buffer.index; -+ expbuf.type = avbuf->buffer.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ avbuf->fd = expbuf.fd; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_requestbuffers req; -+ int ret, i, j, multiplanar; -+ uint32_t memory; -+ -+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
-+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; -+ -+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); -+ -+ memset(&req, 0, sizeof(req)); -+ req.count = queue->num_buffers; -+ req.memory = memory; -+ req.type = fmt->type; -+ -+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); -+ -+ return AVERROR(errno); -+ } -+ -+ queue->num_buffers = req.count; -+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); -+ if (!queue->buffers) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); -+ -+ return AVERROR(ENOMEM); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer *buf = &queue->buffers[i]; -+ -+ buf->enqueued = 0; -+ buf->fd = -1; -+ buf->q = queue; -+ -+ buf->buffer.type = fmt->type; -+ buf->buffer.memory = memory; -+ buf->buffer.index = i; -+ -+ if (multiplanar) { -+ buf->buffer.length = VIDEO_MAX_PLANES; -+ buf->buffer.m.planes = buf->planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); -+ if (ret < 0) { -+ ret = AVERROR(errno); -+ -+ goto fail; -+ } -+ -+ if (multiplanar) -+ buf->num_planes = buf->buffer.length; -+ else -+ buf->num_planes = 1; -+ -+ for (j = 0; j < buf->num_planes; j++) { -+ V4L2PlaneInfo *info = &buf->plane_info[j]; -+ -+ if (multiplanar) { -+ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; -+ info->length = buf->buffer.m.planes[j].length; -+ } else { -+ info->bytesperline = fmt->fmt.pix.bytesperline; -+ info->length = buf->buffer.length; -+ } -+ } -+ -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { -+ ret = deint_v4l2m2m_enqueue_buffer(buf); -+ if (ret) -+ goto fail; -+ -+ ret = v4l2_buffer_export_drm(buf); -+ if (ret) -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ for (i = 0; i < queue->num_buffers; i++) -+ if (queue->buffers[i].fd >= 0) -+ close(queue->buffers[i].fd); -+ av_free(queue->buffers); -+ queue->buffers = NULL; -+ -+ return ret; -+} -+ -+static int 
deint_v4l2m2m_streamon(V4L2Queue *queue) -+{ -+ DeintV4L2M2MContextShared * const ctx = queue->ctx; -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_streamoff(V4L2Queue *queue) -+{ -+ DeintV4L2M2MContextShared * const ctx = queue->ctx; -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+// timeout in ms -+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) -+{ -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_buffer buf = { 0 }; -+ V4L2Buffer* avbuf = NULL; -+ struct pollfd pfd; -+ short events; -+ int ret; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ events = POLLOUT | POLLWRNORM; -+ else -+ events = POLLIN | POLLRDNORM; -+ -+ pfd.events = events; -+ pfd.fd = ctx->fd; -+ -+ for (;;) { -+ ret = poll(&pfd, 1, timeout); -+ if (ret > 0) -+ break; -+ if (errno == EINTR) -+ continue; -+ return NULL; -+ } -+ -+ if (pfd.revents & POLLERR) -+ return NULL; -+ -+ if (pfd.revents & events) { -+ memset(&buf, 0, sizeof(buf)); -+ buf.memory = V4L2_MEMORY_MMAP; -+ buf.type = queue->format.type; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memset(planes, 0, sizeof(planes)); -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); -+ if (ret) { -+ if (errno != EAGAIN) -+ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", -+ av_err2str(AVERROR(errno))); -+ return NULL; -+ } -+ -+ avbuf = &queue->buffers[buf.index]; -+ avbuf->enqueued = 0; -+ 
avbuf->buffer = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buffer.m.planes = avbuf->planes; -+ } -+ return avbuf; -+ } -+ -+ return NULL; -+} -+ -+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) -+{ -+ int i; -+ V4L2Buffer *buf = NULL; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if (!queue->buffers[i].enqueued) { -+ buf = &queue->buffers[i]; -+ break; -+ } -+ return buf; -+} -+ -+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) -+{ -+ int i; -+ V4L2Buffer *buf = NULL; -+ -+ if (!queue || !queue->buffers) -+ return; -+ for (i = 0; i < queue->num_buffers; i++) { -+ buf = &queue->buffers[i]; -+ if (queue->buffers[i].enqueued) -+ av_frame_unref(&buf->frame); -+ } -+} -+ -+static void recycle_q(V4L2Queue * const queue) -+{ -+ V4L2Buffer* avbuf; -+ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { -+ av_frame_unref(&avbuf->frame); -+ } -+} -+ -+static int count_enqueued(V4L2Queue *queue) -+{ -+ int i; -+ int n = 0; -+ -+ if (queue->buffers == NULL) -+ return 0; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if (queue->buffers[i].enqueued) -+ ++n; -+ return n; -+} -+ -+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) -+{ -+ DeintV4L2M2MContextShared *const ctx = queue->ctx; -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ V4L2Buffer *buf; -+ int i; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ recycle_q(queue); -+ -+ buf = deint_v4l2m2m_find_free_buf(queue); -+ if (!buf) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); -+ return AVERROR(EAGAIN); -+ } -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) -+ for (i = 0; i < drm_desc->nb_objects; i++) -+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; -+ else -+ buf->buffer.m.fd = drm_desc->objects[0].fd; -+ -+ buf->buffer.field = !frame->interlaced_frame ? 
V4L2_FIELD_NONE : -+ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB : -+ V4L2_FIELD_INTERLACED_BT; -+ -+ if (ctx->field_order != buf->buffer.field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); -+ ctx->field_order = buf->buffer.field; -+ } -+ -+ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); -+ -+ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; -+ -+ av_frame_move_ref(&buf->frame, frame); -+ -+ return deint_v4l2m2m_enqueue_buffer(buf); -+} -+ -+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) -+{ -+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int i; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); -+ -+ if (ctx->fd >= 0) { -+ deint_v4l2m2m_streamoff(capture); -+ deint_v4l2m2m_streamoff(output); -+ } -+ -+ if (capture->buffers) -+ for (i = 0; i < capture->num_buffers; i++) { -+ capture->buffers[i].q = NULL; -+ if (capture->buffers[i].fd >= 0) -+ close(capture->buffers[i].fd); -+ } -+ -+ deint_v4l2m2m_unref_queued(output); -+ -+ av_buffer_unref(&ctx->hw_frames_ctx); -+ -+ if (capture->buffers) -+ av_free(capture->buffers); -+ -+ if (output->buffers) -+ av_free(output->buffers); -+ -+ if (ctx->fd >= 0) { -+ close(ctx->fd); -+ ctx->fd = -1; -+ } -+ -+ av_free(ctx); -+ } -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+{ -+ V4L2Buffer *buf = opaque; -+ DeintV4L2M2MContextShared *ctx = buf->q->ctx; -+ -+ if (!ctx->done) -+ deint_v4l2m2m_enqueue_buffer(buf); -+ -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) -+{ -+ int av_pix_fmt = AV_PIX_FMT_YUV420P; -+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ 
layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ switch (av_pix_fmt) { -+ case AV_PIX_FMT_YUYV422: -+ -+ layer->format = DRM_FORMAT_YUYV; -+ layer->nb_planes = 1; -+ -+ break; -+ -+ case AV_PIX_FMT_NV12: -+ case AV_PIX_FMT_NV21: -+ -+ layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? -+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ break; -+ -+ case AV_PIX_FMT_YUV420P: -+ -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + -+ ((avbuf->plane_info[0].bytesperline * -+ height) >> 2); -+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ break; -+ } -+ -+ return (uint8_t *) drm_desc; -+} -+ -+// timeout in ms -+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) -+{ -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ V4L2Buffer* avbuf; -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); -+ if (!avbuf) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); -+ return AVERROR(EAGAIN); -+ } -+ -+ // Fill in PTS and anciliary info from src frame -+ // we will want to 
overwrite some fields as only the pts/dts -+ // fields are updated with new timing in this fn -+ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); -+ -+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, -+ sizeof(avbuf->drm_frame), v4l2_free_buffer, -+ avbuf, AV_BUFFER_FLAG_READONLY); -+ if (!frame->buf[0]) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); -+ return AVERROR(ENOMEM); -+ } -+ -+ atomic_fetch_add(&ctx->refcount, 1); -+ -+ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (ctx->hw_frames_ctx) -+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); -+ frame->height = ctx->height; -+ frame->width = ctx->width; -+ -+ // Not interlaced now -+ frame->interlaced_frame = 0; -+ frame->top_field_first = 0; -+ // Pkt duration halved -+ frame->pkt_duration /= 2; -+ -+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); -+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); -+ return 0; -+} -+ -+static int deint_v4l2m2m_config_props(AVFilterLink *outlink) -+{ -+ AVFilterLink *inlink = outlink->src->inputs[0]; -+ AVFilterContext *avctx = outlink->src; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ int ret; -+ -+ ctx->height = avctx->inputs[0]->h; -+ ctx->width = avctx->inputs[0]->w; -+ -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); -+ -+ outlink->time_base = inlink->time_base; -+ outlink->w = inlink->w; -+ outlink->h = inlink->h; -+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; -+ outlink->format = inlink->format; -+ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate -+ -+ ret = deint_v4l2m2m_find_device(ctx); -+ if (ret) -+ return ret; -+ -+ 
if (inlink->hw_frames_ctx) { -+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); -+ if (!ctx->hw_frames_ctx) -+ return AVERROR(ENOMEM); -+ } -+ return 0; -+} -+ -+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *avctx = link->dst; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int ret; -+ -+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", -+ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); -+ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, -+ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); -+ -+ if (ctx->field_order == V4L2_FIELD_ANY) { -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; -+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; -+ -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, -+ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); -+ -+ if (in->top_field_first) -+ ctx->field_order = V4L2_FIELD_INTERLACED_TB; -+ else -+ ctx->field_order = V4L2_FIELD_INTERLACED_BT; -+ -+ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(capture); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(capture); -+ if (ret) -+ 
return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(output); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(output); -+ if (ret) -+ return ret; -+ } -+ -+ ret = deint_v4l2m2m_enqueue_frame(output, in); -+ -+ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); -+ return ret; -+} -+ -+static int deint_v4l2m2m_activate(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext * const priv = avctx->priv; -+ DeintV4L2M2MContextShared *const s = priv->shared; -+ AVFilterLink * const outlink = avctx->outputs[0]; -+ AVFilterLink * const inlink = avctx->inputs[0]; -+ int n = 0; -+ int cn = 99; -+ int instatus = 0; -+ int64_t inpts = 0; -+ int did_something = 0; -+ -+ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); -+ -+ ff_inlink_acknowledge_status(inlink, &instatus, &inpts); -+ -+ if (!ff_outlink_frame_wanted(outlink)) { -+ av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); -+ } -+ else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! -+ { -+ AVFrame * frame = av_frame_alloc(); -+ int rv; -+ -+again: -+ recycle_q(&s->output); -+ n = count_enqueued(&s->output); -+ -+ if (frame == NULL) { -+ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); -+ return AVERROR(ENOMEM); -+ } -+ -+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 
300 : 0); -+ if (rv != 0) { -+ av_frame_free(&frame); -+ if (rv != AVERROR(EAGAIN)) { -+ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } -+ } -+ else { -+ frame->interlaced_frame = 0; -+ // frame is always consumed by filter_frame - even on error despite -+ // a somewhat confusing comment in the header -+ rv = ff_filter_frame(outlink, frame); -+ -+ if (instatus != 0) { -+ av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); -+ goto again; -+ } -+ -+ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); -+ did_something = 1; -+ } -+ -+ cn = count_enqueued(&s->capture); -+ } -+ -+ if (instatus != 0) { -+ ff_outlink_set_status(outlink, instatus, inpts); -+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); -+ return 0; -+ } -+ -+ { -+ AVFrame * frame; -+ int rv; -+ -+ recycle_q(&s->output); -+ n = count_enqueued(&s->output); -+ -+ while (n < 6) { -+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { -+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } -+ -+ if (frame == NULL) { -+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); -+ break; -+ } -+ -+ deint_v4l2m2m_filter_frame(inlink, frame); -+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); -+ ++n; -+ } -+ } -+ -+ if (n < 6) { -+ ff_inlink_request_frame(inlink); -+ did_something = 1; -+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); -+ } -+ -+ if (n > 4 && ff_outlink_frame_wanted(outlink)) { -+ ff_filter_set_ready(avctx, 1); -+ did_something = 1; -+ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); -+ } -+ -+ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); -+ return did_something ? 
0 : FFERROR_NOT_READY; -+} -+ -+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext * const priv = avctx->priv; -+ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); -+ -+ if (!ctx) { -+ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); -+ return AVERROR(ENOMEM); -+ } -+ priv->shared = ctx; -+ ctx->logctx = priv; -+ ctx->fd = -1; -+ ctx->output.ctx = ctx; -+ ctx->output.num_buffers = 8; -+ ctx->capture.ctx = ctx; -+ ctx->capture.num_buffers = 12; -+ ctx->done = 0; -+ ctx->field_order = V4L2_FIELD_ANY; -+ -+ pts_track_init(&ctx->track, priv); -+ -+ atomic_init(&ctx->refcount, 1); -+ -+ return 0; -+} -+ -+static void deint_v4l2m2m_uninit(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ -+ ctx->done = 1; -+ ctx->logctx = NULL; // Log to NULL works, log to missing crashes -+ pts_track_uninit(&ctx->track); -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static const AVOption deinterlace_v4l2m2m_options[] = { -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); -+ -+static const AVFilterPad deint_v4l2m2m_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ }, -+}; -+ -+static const AVFilterPad deint_v4l2m2m_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .config_props = deint_v4l2m2m_config_props, -+ }, -+}; -+ -+AVFilter ff_vf_deinterlace_v4l2m2m = { -+ .name = "deinterlace_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &deint_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ FILTER_INPUTS(deint_v4l2m2m_inputs), -+ FILTER_OUTPUTS(deint_v4l2m2m_outputs), -+ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), -+ .priv_class = &deinterlace_v4l2m2m_class, -+ .activate = deint_v4l2m2m_activate, -+}; - -From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 
00:00:00 2001 -From: John Cox -Date: Thu, 2 Dec 2021 17:49:55 +0000 -Subject: [PATCH 031/136] Put no_pts_rescale in context which makes more sense - than an arg - ---- - libavcodec/v4l2_buffers.c | 28 ++++++++++++++-------------- - libavcodec/v4l2_buffers.h | 5 ++--- - libavcodec/v4l2_context.c | 8 ++++---- - libavcodec/v4l2_context.h | 13 +++++++++---- - libavcodec/v4l2_m2m_dec.c | 9 +++++---- - 5 files changed, 34 insertions(+), 29 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 126d2a17f4..22da6bd722 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -39,7 +39,7 @@ - #define USEC_PER_SEC 1000000 - static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - --static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) -+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) - { - return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? - container_of(buf->context, V4L2m2mContext, output) : -@@ -51,34 +51,34 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) - return buf_to_m2mctx(buf)->avctx; - } - --static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) -+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) - { -- V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); - const AVRational tb = s->avctx->pkt_timebase.num ? - s->avctx->pkt_timebase : - s->avctx->time_base; - return tb.num && tb.den ? tb : v4l2_timebase; - } - --static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) -+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) - { - /* convert pts to v4l2 timebase */ - const int64_t v4l2_pts = -- no_rescale ? pts : -+ out->context->no_pts_rescale ? pts : - pts == AV_NOPTS_VALUE ? 
0 : - av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; - } - --static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) -+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) - { - /* convert pts back to encoder timebase */ - const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - - return -- no_rescale ? v4l2_pts : -+ avbuf->context->no_pts_rescale ? v4l2_pts : - v4l2_pts == 0 ? AV_NOPTS_VALUE : - av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); - } -@@ -686,13 +686,13 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); - v4l2_set_color_range(out, frame->color_range); - // PTS & interlace are buffer vars -- v4l2_set_pts(out, frame->pts, 0); -+ v4l2_set_pts(out, frame->pts); - v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - - return v4l2_buffer_swframe_to_buf(frame, out); - } - --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - { - int ret; - V4L2Context * const ctx = avbuf->context; -@@ -710,7 +710,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc - frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); - frame->color_trc = v4l2_get_color_trc(avbuf); -- frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); -+ frame->pts = v4l2_get_pts(avbuf); - frame->pkt_dts = AV_NOPTS_VALUE; - frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); - frame->top_field_first = v4l2_buf_is_top_first(avbuf); -@@ -757,13 +757,13 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - pkt->flags |= AV_PKT_FLAG_CORRUPT; - } - -- 
pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); -+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf); - - return 0; - } - - int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -- const void *extdata, size_t extlen, int no_rescale_pts) -+ const void *extdata, size_t extlen) - { - int ret; - -@@ -777,7 +777,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - if (ret && ret != AVERROR(ENOMEM)) - return ret; - -- v4l2_set_pts(out, pkt->pts, no_rescale_pts); -+ v4l2_set_pts(out, pkt->pts); - - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; -@@ -787,7 +787,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) - { -- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); -+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); - } - - -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 111526aee3..641e0e147b 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -83,12 +83,11 @@ typedef struct V4L2Buffer { - * - * @param[in] frame The AVFRame to push the information to - * @param[in] buf The V4L2Buffer to get the information from -- * @param[in] no_rescale_pts If non-zero do not rescale PTS - * - * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, - * AVERROR(ENOMEM) if the AVBufferRef can't be created. 
- */ --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); - - /** - * Extracts the data from a V4L2Buffer to an AVPacket -@@ -113,7 +112,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - - int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -- const void *extdata, size_t extlen, int no_rescale_pts); -+ const void *extdata, size_t extlen); - - /** - * Extracts the data from an AVFrame to a V4L2Buffer -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 440dfaaba5..64540a37b3 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -808,7 +808,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - } - - int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, -- const void * extdata, size_t extlen, int no_rescale_pts) -+ const void * extdata, size_t extlen) - { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); - V4L2Buffer* avbuf; -@@ -827,7 +827,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); - if (ret == AVERROR(ENOMEM)) - av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", - __func__, pkt->size, avbuf->planes[0].length); -@@ -837,7 +837,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - { - V4L2Buffer *avbuf; - -@@ -854,7 +854,7 @@ int 
ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, - return AVERROR(EAGAIN); - } - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); -+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); - } - - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 37b0431400..4cc164886c 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -102,6 +102,13 @@ typedef struct V4L2Context { - */ - int done; - -+ /** -+ * PTS rescale not wanted -+ * If the PTS is just a dummy frame count then rescale is -+ * actively harmful -+ */ -+ int no_pts_rescale; -+ - AVBufferRef *frames_ref; - int q_count; - int dq_count; -@@ -172,12 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); - * @param[in] ctx The V4L2Context to dequeue from. - * @param[inout] f The AVFrame to dequeue to. - * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) -- * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as -- * timestamp directly) - * - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. - */ --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); - - /** - * Enqueues a buffer to a V4L2Context from an AVPacket -@@ -189,7 +194,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int - * @param[in] pkt A pointer to an AVPacket. - * @return 0 in case of success, a negative error otherwise. 
- */ --int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); - - /** - * Enqueues a buffer to a V4L2Context from an AVFrame -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index b49f470c0a..36754b314a 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -360,7 +360,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - if (!s->draining) { - // Calling enqueue with an empty pkt starts drain - av_assert0(s->buf_pkt.size == 0); -- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); - if (ret) { - av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); - return ret; -@@ -381,8 +381,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - return ret; - - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, -- avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, -- 1); -+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size); - - if (ret == AVERROR(EAGAIN)) { - // Out of input buffers - keep packet -@@ -442,7 +441,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // when discarding - // This returns AVERROR(EAGAIN) if there isn't a frame ready yet - // but there is room in the input Q -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 
100 : -1); - - if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) - av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -@@ -569,10 +568,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - output->av_codec_id = avctx->codec_id; - output->av_pix_fmt = AV_PIX_FMT_NONE; - output->min_buf_size = max_coded_size(avctx); -+ output->no_pts_rescale = 1; - - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; - capture->min_buf_size = 0; -+ capture->no_pts_rescale = 1; - - /* the client requests the codec to generate DRM frames: - * - data[0] will therefore point to the returned AVDRMFrameDescriptor - -From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 8 Dec 2021 15:00:37 +0000 -Subject: [PATCH 032/136] Use bitbuf min size for all streams - ---- - libavcodec/v4l2_m2m_dec.c | 5 +---- - 1 file changed, 1 insertion(+), 4 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 36754b314a..48a6810d18 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -507,15 +507,12 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - #endif - -+// This heuristic is for H264 but use for everything - static uint32_t max_coded_size(const AVCodecContext * const avctx) - { - uint32_t wxh = avctx->coded_width * avctx->coded_height; - uint32_t size; - -- // Currently the only thing we try to set our own limits for is H264 -- if (avctx->codec_id != AV_CODEC_ID_H264) -- return 0; -- - size = wxh * 3 / 2; - // H.264 Annex A table A-1 gives minCR which is either 2 or 4 - // unfortunately that doesn't yield an actually useful limit - -From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 3 Dec 2021 12:54:18 +0000 -Subject: [PATCH 033/136] Track pending frames in v4l2 stateful - -Track which frames are pending decode in the v4l2 stateful decoder. 
-This relies on DTS & PTS having some relationship to reality, so -any use of this code must cope with the results being wrong. - -Also moves the xlat state vars out of the main context and into their -own structure. ---- - libavcodec/v4l2_m2m.h | 15 ++++-- - libavcodec/v4l2_m2m_dec.c | 100 +++++++++++++++++++++++++++++--------- - 2 files changed, 89 insertions(+), 26 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 82feb0afdb..3f86809623 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -44,8 +44,10 @@ - #define FF_V4L2_M2M_TRACK_SIZE 128 - typedef struct V4L2m2mTrackEl { - int discard; // If we see this buffer its been flushed, so discard -+ int pending; - int pkt_size; - int64_t pts; -+ int64_t dts; - int64_t reordered_opaque; - int64_t pkt_pos; - int64_t pkt_duration; -@@ -62,6 +64,14 @@ typedef struct pts_stats_s - int64_t guess; - } pts_stats_t; - -+typedef struct xlat_track_s { -+ unsigned int track_no; -+ int64_t last_pts; -+ int64_t last_pkt_dts; -+ int64_t last_opaque; -+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; -+} xlat_track_t; -+ - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; - int fd; -@@ -96,10 +106,7 @@ typedef struct V4L2m2mContext { - int output_drm; - - /* Frame tracking */ -- int64_t last_pkt_dts; -- int64_t last_opaque; -- unsigned int track_no; -- V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; -+ xlat_track_t xlat; - - pts_stats_t pts_stat; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 48a6810d18..d8ebb466cd 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -242,22 +242,24 @@ static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts - // buffer of all the things we want preserved (including the original PTS) - // indexed by the tracking no. 
- static void --xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) -+xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) - { - int64_t track_pts; - - // Avoid 0 -- if (++s->track_no == 0) -- s->track_no = 1; -+ if (++x->track_no == 0) -+ x->track_no = 1; - -- track_pts = track_to_pts(avctx, s->track_no); -+ track_pts = track_to_pts(avctx, x->track_no); - -- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); -- s->last_pkt_dts = avpkt->dts; -- s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -+ x->last_pkt_dts = avpkt->dts; -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ - .discard = 0, -+ .pending = 1, - .pkt_size = avpkt->size, - .pts = avpkt->pts, -+ .dts = avpkt->dts, - .reordered_opaque = avctx->reordered_opaque, - .pkt_pos = avpkt->pos, - .pkt_duration = avpkt->duration, -@@ -268,31 +270,36 @@ xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *cons - - // Returns -1 if we should discard the frame - static int --xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) -+xlat_pts_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, -+ pts_stats_t * const ps, -+ AVFrame *const frame) - { - unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -- const V4L2m2mTrackEl *const t = s->track_els + n; -+ V4L2m2mTrackEl *const t = x->track_els + n; - if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) - { - av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); - frame->pts = AV_NOPTS_VALUE; -- frame->pkt_dts = s->last_pkt_dts; -- frame->reordered_opaque = 
s->last_opaque; -+ frame->pkt_dts = x->last_pkt_dts; -+ frame->reordered_opaque = x->last_opaque; - frame->pkt_pos = -1; - frame->pkt_duration = 0; - frame->pkt_size = -1; - } - else if (!t->discard) - { -- frame->pts = t->pts; -- frame->pkt_dts = s->last_pkt_dts; -+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; -+ frame->pkt_dts = x->last_pkt_dts; - frame->reordered_opaque = t->reordered_opaque; - frame->pkt_pos = t->pkt_pos; - frame->pkt_duration = t->pkt_duration; - frame->pkt_size = t->pkt_size; - -- s->last_opaque = s->track_els[n].reordered_opaque; -- s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (frame->pts != AV_NOPTS_VALUE) -+ x->last_pts = frame->pts; -+ t->pending = 0; - } - else - { -@@ -300,14 +307,62 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons - return -1; - } - -- pts_stats_add(&s->pts_stat, frame->pts); -+ pts_stats_add(ps, frame->pts); - -- frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); -+ frame->best_effort_timestamp = pts_stats_guess(ps); - frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); - return 0; - } - -+static void -+xlat_flush(xlat_track_t * const x) -+{ -+ unsigned int i; -+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { -+ x->track_els[i].pending = 0; -+ x->track_els[i].discard = 1; -+ } -+ x->last_pts = AV_NOPTS_VALUE; -+} -+ -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ x->last_pts = AV_NOPTS_VALUE; -+} -+ -+static int -+xlat_pending(const xlat_track_t * const x) -+{ -+ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; -+ unsigned int i; -+ int r = 0; -+ int64_t now = AV_NOPTS_VALUE; -+ -+ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { -+ const V4L2m2mTrackEl * const t = x->track_els + n; -+ -+ if (!t->pending) -+ continue; -+ -+ if (now == AV_NOPTS_VALUE) -+ now = t->dts; -+ -+ if (t->pts == AV_NOPTS_VALUE || -+ ((now == AV_NOPTS_VALUE || t->pts <= now) && -+ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) -+ ++r; -+ } -+ -+ // If we never get any ideas about PTS vs DTS allow a lot more buffer -+ if (now == AV_NOPTS_VALUE) -+ r -= 16; -+ -+ return r; -+} -+ - static inline int stream_started(const V4L2m2mContext * const s) { - return s->capture.streamon && s->output.streamon; - } -@@ -374,7 +429,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - return ret; - } - -- xlat_pts_in(avctx, s, &s->buf_pkt); -+ xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); - } - - if ((ret = check_output_streamon(avctx, s)) != 0) -@@ -417,6 +472,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - int dst_rv = 1; // Non-zero (done), non-negative (error) number - - do { -+ av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); - src_rv = try_enqueue_src(avctx, s); - - // If we got a frame last time and we have nothing to enqueue then -@@ -451,7 +507,7 @@ static int 
v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - s->draining, s->capture.done, dst_rv); - - // Go again if we got a frame that we need to discard -- } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); -+ } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); - } - - // Continue trying to enqueue packets if either -@@ -550,6 +606,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if (ret < 0) - return ret; - -+ xlat_init(&s->xlat); - pts_stats_init(&s->pts_stat, avctx, "decoder"); - - capture = &s->capture; -@@ -632,7 +689,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - V4L2m2mContext * const s = priv->context; - V4L2Context * const output = &s->output; - V4L2Context * const capture = &s->capture; -- int ret, i; -+ int ret; - - av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); - -@@ -646,8 +703,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - - // V4L2 makes no guarantees about whether decoded frames are flushed or not - // so mark all frames we are tracking to be discarded if they appear -- for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) -- s->track_els[i].discard = 1; -+ xlat_flush(&s->xlat); - - // resend extradata - s->extdata_sent = 0; - -From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 15 Dec 2021 17:58:21 +0000 -Subject: [PATCH 034/136] Use pending tracking to reduce v4l2 latency - -If there are more than 5 pending decodes outstanding then add a small -timeout to the capture poll to reduce the rate at which frames are -added. 
---- - libavcodec/v4l2_m2m_dec.c | 58 ++++++++++++++++++++++++--------------- - 1 file changed, 36 insertions(+), 22 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index d8ebb466cd..7e7e4729d0 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -370,16 +370,19 @@ static inline int stream_started(const V4L2m2mContext * const s) { - #define NQ_OK 0 - #define NQ_Q_FULL 1 - #define NQ_SRC_EMPTY 2 --#define NQ_DRAINING 3 --#define NQ_DEAD 4 -+#define NQ_NONE 3 -+#define NQ_DRAINING 4 -+#define NQ_DEAD 5 - - #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) -+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) - - // AVERROR_EOF Flushing an already flushed stream - // -ve Error (all errors except EOF are unexpected) - // NQ_OK (0) OK - // NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) - // NQ_SRC_EMPTY Src empty (do not retry) -+// NQ_NONE Enqueue not attempted - // NQ_DRAINING At EOS, dQ dest until EOS there too - // NQ_DEAD Not running (do not retry, do not attempt capture dQ) - -@@ -468,23 +471,28 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- int src_rv; -+ int src_rv = NQ_NONE; - int dst_rv = 1; // Non-zero (done), non-negative (error) number -+ unsigned int i = 0; - - do { -- av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); -- src_rv = try_enqueue_src(avctx, s); -- -- // If we got a frame last time and we have nothing to enqueue then -- // return now. 
rv will be AVERROR(EAGAIN) indicating that we want more input -- // This should mean that once decode starts we enter a stable state where -- // we alternately ask for input and produce output -- if (s->req_pkt && src_rv == NQ_SRC_EMPTY) -- break; -- -- if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { -- av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); -- src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue -+ const int pending = xlat_pending(&s->xlat); -+ const int prefer_dq = (pending > 5); -+ -+ // Enqueue another pkt for decode if -+ // (a) We don't have a lot of stuff in the buffer already OR -+ // (b) ... we (think we) do but we've failed to get a frame already OR -+ // (c) We've dequeued a lot of frames without asking for input -+ if (!prefer_dq || i != 0 || s->req_pkt > 2) { -+ src_rv = try_enqueue_src(avctx, s); -+ -+ // If we got a frame last time or we've already tried to get a frame and -+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) -+ // indicating that we want more input. -+ // This should mean that once decode starts we enter a stable state where -+ // we alternately ask for input and produce output -+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) -+ break; - } - - // Try to get a new frame if -@@ -495,9 +503,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // Dequeue frame will unref any previous contents of frame - // if it returns success so we don't need an explicit unref - // when discarding -- // This returns AVERROR(EAGAIN) if there isn't a frame ready yet -- // but there is room in the input Q -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1); -+ // This returns AVERROR(EAGAIN) on timeout or if -+ // there is room in the input Q and timeout == -1 -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 
5 : -1); - - if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) - av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -@@ -510,10 +518,16 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); - } - -+ ++i; -+ if (i >= 256) { -+ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); -+ src_rv = AVERROR(EIO); -+ } -+ - // Continue trying to enqueue packets if either - // (a) we succeeded last time OR -- // (b) enqueue failed due to input Q full AND there is now room -- } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); -+ // (b) we didn't ret a frame and we can retry the input -+ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); - - // Ensure that the frame contains nothing if we aren't returning a frame - // (might happen when discarding) -@@ -521,7 +535,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - av_frame_unref(frame); - - // If we got a frame this time ask for a pkt next time -- s->req_pkt = (dst_rv == 0); -+ s->req_pkt = (dst_rv == 0) ? 
s->req_pkt + 1 : 0; - - #if 0 - if (dst_rv == 0) - -From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 15 Dec 2021 12:23:54 +0000 -Subject: [PATCH 035/136] Allow logger() to take const ctx - ---- - libavcodec/v4l2_buffers.c | 2 +- - libavcodec/v4l2_context.c | 4 ++-- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 22da6bd722..39c0094aec 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -46,7 +46,7 @@ static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) - container_of(buf->context, V4L2m2mContext, capture); - } - --static inline AVCodecContext *logger(V4L2Buffer *buf) -+static inline AVCodecContext *logger(const V4L2Buffer * const buf) - { - return buf_to_m2mctx(buf)->avctx; - } -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 64540a37b3..d3df48aed4 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -43,14 +43,14 @@ struct v4l2_format_update { - int update_avfmt; - }; - --static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) -+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) - { - return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
- container_of(ctx, V4L2m2mContext, output) : - container_of(ctx, V4L2m2mContext, capture); - } - --static inline AVCodecContext *logger(V4L2Context *ctx) -+static inline AVCodecContext *logger(const V4L2Context *ctx) - { - return ctx_to_m2mctx(ctx)->avctx; - } - -From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 15 Dec 2021 13:00:27 +0000 -Subject: [PATCH 036/136] Track numbere of bufs qed with an atomic - -Safer and faster than counting status ---- - libavcodec/v4l2_buffers.c | 6 +++--- - libavcodec/v4l2_context.c | 3 ++- - libavcodec/v4l2_context.h | 3 +-- - 3 files changed, 6 insertions(+), 6 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 39c0094aec..2cf7be6632 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -922,6 +922,7 @@ fail: - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - { - int ret; -+ int qc; - - avbuf->buf.flags = avbuf->flags; - -@@ -941,11 +942,10 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - return AVERROR(err); - } - -- ++avbuf->context->q_count; -+ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; - av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", - avbuf->context->name, avbuf->buf.index, -- avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -- avbuf->context->q_count); -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); - - avbuf->status = V4L2BUF_IN_DRIVER; - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index d3df48aed4..268a057e53 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -599,7 +599,7 @@ static int v4l2_release_buffers(V4L2Context* ctx) - " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); - } - } -- ctx->q_count = 0; -+ atomic_store(&ctx->q_count, 0); - - return ret; - } -@@ -1019,6 +1019,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) - } - - ff_mutex_init(&ctx->lock, NULL); -+ atomic_init(&ctx->q_count, 0); - - if (s->output_drm) { - AVHWFramesContext *hwframes; -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 4cc164886c..a4176448d5 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -110,8 +110,7 @@ typedef struct V4L2Context { - int no_pts_rescale; - - AVBufferRef *frames_ref; -- int q_count; -- int dq_count; -+ atomic_int q_count; - struct ff_weak_link_master *wl_master; - - AVMutex lock; - -From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 9 Dec 2021 12:01:25 +0000 -Subject: [PATCH 037/136] Clear pkt_buf on flush - ---- - libavcodec/v4l2_m2m_dec.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 7e7e4729d0..09ec496351 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -715,6 +715,9 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); - -+ // Clear any buffered input packet -+ av_packet_unref(&s->buf_pkt); -+ - // V4L2 makes no guarantees about whether decoded frames are flushed or not - // so mark all frames we are tracking to be discarded if they appear - xlat_flush(&s->xlat); - -From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 15 Dec 2021 12:52:56 +0000 -Subject: [PATCH 038/136] Rework v4l2 buffer dequeue - ---- - libavcodec/v4l2_context.c | 543 ++++++++++++++++++-------------------- - libavcodec/v4l2_context.h | 2 + - libavcodec/v4l2_m2m.c | 1 - - libavcodec/v4l2_m2m.h | 16 +- - libavcodec/v4l2_m2m_dec.c | 138 ++++------ - 5 files changed, 327 insertions(+), 373 deletions(-) - -diff --git 
a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 268a057e53..d765181645 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -73,19 +73,27 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) - return sar; - } - --static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) -+static inline int ctx_buffers_alloced(const V4L2Context * const ctx) - { -- struct v4l2_format *fmt1 = &ctx->format; -- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? -- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || -- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height -- : -- fmt1->fmt.pix.width != fmt2->fmt.pix.width || -- fmt1->fmt.pix.height != fmt2->fmt.pix.height; -+ return ctx->bufrefs != NULL; -+} -+ -+// Width/Height changed or we don't have an alloc in the first place? -+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) -+{ -+ const struct v4l2_format *fmt1 = &ctx->format; -+ int ret = !ctx_buffers_alloced(ctx) || -+ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
-+ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || -+ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height -+ : -+ fmt1->fmt.pix.width != fmt2->fmt.pix.width || -+ fmt1->fmt.pix.height != fmt2->fmt.pix.height); - - if (ret) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", -+ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", - ctx->name, -+ ctx_buffers_alloced(ctx), - ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), - ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); - -@@ -167,10 +175,8 @@ static int do_source_change(V4L2m2mContext * const s) - - int ret; - int reinit; -- int full_reinit; - struct v4l2_format cap_fmt = s->capture.format; - -- s->resize_pending = 0; - s->capture.done = 0; - - ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); -@@ -179,15 +185,21 @@ static int do_source_change(V4L2m2mContext * const s) - return 0; - } - -- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -- - get_default_selection(&s->capture, &s->capture.selection); - -- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); -+ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); -+ s->capture.format = cap_fmt; - if (reinit) { - s->capture.height = ff_v4l2_get_format_height(&cap_fmt); - s->capture.width = ff_v4l2_get_format_width(&cap_fmt); - } -+ -+ // If we don't support selection (or it is bust) and we obviously have HD then kludge -+ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && -+ (s->capture.height == 1088 && s->capture.width == 1920)) { -+ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; -+ } -+ - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - - av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", -@@ -195,11 +207,11 @@ static int do_source_change(V4L2m2mContext * const s) - s->capture.selection.width, s->capture.selection.height, - s->capture.selection.left, 
s->capture.selection.top); - -- s->reinit = 1; -- - if (reinit) { - if (avctx) -- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); -+ ret = ff_set_dimensions(s->avctx, -+ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, -+ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height); - if (ret < 0) - av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); - -@@ -208,11 +220,22 @@ static int do_source_change(V4L2m2mContext * const s) - av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); - return AVERROR(EINVAL); - } -+ -+ // Update pixel format - should only actually do something on initial change -+ s->capture.av_pix_fmt = -+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); -+ if (s->output_drm) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ avctx->sw_pix_fmt = s->capture.av_pix_fmt; -+ } -+ else -+ avctx->pix_fmt = s->capture.av_pix_fmt; -+ - goto reinit_run; - } - - /* Buffers are OK so just stream off to ack */ -- av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); -+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); - - ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); - if (ret) -@@ -225,54 +248,6 @@ reinit_run: - return 1; - } - --static int ctx_done(V4L2Context * const ctx) --{ -- int rv = 0; -- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -- -- ctx->done = 1; -- -- if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -- rv = do_source_change(s); -- -- return rv; --} -- --/** -- * handle resolution change event and end of stream event -- * returns 1 if reinit was successful, negative if it failed -- * returns 0 if reinit was not executed -- */ --static int v4l2_handle_event(V4L2Context *ctx) --{ -- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -- struct v4l2_event evt = { 0 }; -- int ret; -- -- ret = ioctl(s->fd, VIDIOC_DQEVENT, 
&evt); -- if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); -- return 0; -- } -- -- av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); -- -- if (evt.type == V4L2_EVENT_EOS) { --// ctx->done = 1; -- av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); -- return 0; -- } -- -- if (evt.type != V4L2_EVENT_SOURCE_CHANGE) -- return 0; -- -- s->resize_pending = 1; -- if (!ctx->done) -- return 0; -- -- return do_source_change(s); --} -- - static int v4l2_stop_decode(V4L2Context *ctx) - { - struct v4l2_decoder_cmd cmd = { -@@ -313,243 +288,252 @@ static int v4l2_stop_encode(V4L2Context *ctx) - return 0; - } - --static int count_in_driver(const V4L2Context * const ctx) -+// DQ a buffer -+// Amalgamates all the various ways there are of signalling EOS/Event to -+// generate a consistant EPIPE. -+// -+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) -+// -+// Returns: -+// 0 Success -+// AVERROR(EPIPE) Nothing more to read -+// * AVERROR(..) 
-+ -+ static int -+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) - { -- int i; -- int n = 0; -+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); -+ AVCodecContext * const avctx = m->avctx; -+ V4L2Buffer * avbuf; -+ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); - -- if (!ctx->bufrefs) -- return -1; -- -- for (i = 0; i < ctx->num_buffers; ++i) { -- V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -- if (avbuf->status == V4L2BUF_IN_DRIVER) -- ++n; -- } -- return n; --} -+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; - --static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) --{ -- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -- const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); -- struct v4l2_plane planes[VIDEO_MAX_PLANES]; -- struct v4l2_buffer buf = { 0 }; -- V4L2Buffer *avbuf; -- struct pollfd pfd = { -- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ -- .fd = ctx_to_m2mctx(ctx)->fd, -+ struct v4l2_buffer buf = { -+ .type = ctx->type, -+ .memory = V4L2_MEMORY_MMAP, - }; -- int i, ret; -- int no_rx_means_done = 0; -- -- if (is_capture && ctx->bufrefs) { -- for (i = 0; i < ctx->num_buffers; i++) { -- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -- if (avbuf->status == V4L2BUF_IN_DRIVER) -- break; -- } -- if (i == ctx->num_buffers) -- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " -- "userspace. 
Increase num_capture_buffers " -- "to prevent device deadlock or dropped " -- "packets/frames.\n", i); -+ -+ *ppavbuf = NULL; -+ -+ if (ctx->flag_last) -+ return AVERROR(EPIPE); -+ -+ if (is_mp) { -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; - } - --#if 0 -- // I think this is true but pointless -- // we will get some other form of EOF signal -- -- /* if we are draining and there are no more capture buffers queued in the driver we are done */ -- if (is_capture && ctx_to_m2mctx(ctx)->draining) { -- for (i = 0; i < ctx->num_buffers; i++) { -- /* capture buffer initialization happens during decode hence -- * detection happens at runtime -- */ -- if (!ctx->bufrefs) -- break; -- -- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -- if (avbuf->status == V4L2BUF_IN_DRIVER) -- goto start; -+ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { -+ const int err = errno; -+ av_assert0(AVERROR(err) < 0); -+ if (err != EINTR) { -+ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -+ ctx->name, av_err2str(AVERROR(err))); -+ -+ if (err == EPIPE) -+ ctx->flag_last = 1; -+ -+ return AVERROR(err); - } -- ctx->done = 1; -- return NULL; - } --#endif -- --start: -- if (is_capture) { -- /* no need to listen to requests for more input while draining */ -- if (ctx_to_m2mctx(ctx)->draining || timeout > 0) -- pfd.events = POLLIN | POLLRDNORM | POLLPRI; -- } else { -- pfd.events = POLLOUT | POLLWRNORM; -+ atomic_fetch_sub(&ctx->q_count, 1); -+ -+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ avbuf->status = V4L2BUF_AVAILABLE; -+ avbuf->buf = buf; -+ if (is_mp) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buf.m.planes = avbuf->planes; - } -- no_rx_means_done = s->resize_pending && is_capture; - -- for (;;) { -- // If we have a resize pending then all buffers should be Qed -- // With a resize pending we should be in drain but evidence suggests -- // that not all decoders do this so poll to clear -- int t2 = no_rx_means_done ? 0 : timeout < 0 ? 
3000 : timeout; -- const int e = pfd.events; -- -- ret = poll(&pfd, 1, t2); -+ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { -+ // Zero length cap buffer return == EOS -+ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); - -- if (ret > 0) -- break; -+ // Must reQ so we don't leak -+ // May not matter if the next thing we do is release all the -+ // buffers but better to be tidy. -+ ff_v4l2_buffer_enqueue(avbuf); - -- if (ret < 0) { -- int err = errno; -- if (err == EINTR) -- continue; -- av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", -- err, strerror(err), -- e, count_in_driver(ctx)); -- return NULL; -+ ctx->flag_last = 1; -+ return AVERROR(EPIPE); - } - -- // ret == 0 (timeout) -- if (no_rx_means_done) { -- av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); -- ret = ctx_done(ctx); -- if (ret > 0) -- goto start; -- } -- if (timeout == -1) -- av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; -- return NULL; -+#ifdef V4L2_BUF_FLAG_LAST -+ // If flag_last set then this contains data but is the last frame -+ // so remember that but return OK -+ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) -+ ctx->flag_last = 1; -+#endif - } - -- /* 0. 
handle errors */ -- if (pfd.revents & POLLERR) { -- /* if we are trying to get free buffers but none have been queued yet -- no need to raise a warning */ -- if (timeout == 0) { -- for (i = 0; i < ctx->num_buffers; i++) { -- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -- if (avbuf->status != V4L2BUF_AVAILABLE) -- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); -- } -- } -- else -- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); -+ *ppavbuf = avbuf; -+ return 0; -+} - -- return NULL; -- } -+/** -+ * handle resolution change event and end of stream event -+ * Expects to be called after the stream has stopped -+ * -+ * returns 1 if reinit was successful, negative if it failed -+ * returns 0 if reinit was not executed -+ */ -+static int -+get_event(V4L2m2mContext * const m) -+{ -+ AVCodecContext * const avctx = m->avctx; -+ struct v4l2_event evt = { 0 }; - -- /* 1. handle resolution changes */ -- if (pfd.revents & POLLPRI) { -- ret = v4l2_handle_event(ctx); -- if (ret < 0) { -- /* if re-init failed, abort */ -- ctx->done = 1; -- return NULL; -+ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { -+ const int rv = AVERROR(errno); -+ if (rv == AVERROR(EINTR)) -+ continue; -+ if (rv == AVERROR(EAGAIN)) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); -+ return AVERROR_EOF; - } -- if (ret > 0) -- goto start; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); -+ return rv; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); -+ -+ if (evt.type == V4L2_EVENT_EOS) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); -+ return AVERROR_EOF; - } - -- /* 2. 
dequeue the buffer */ -- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { -+ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) -+ return do_source_change(m); - -- if (is_capture) { -- /* there is a capture buffer ready */ -- if (pfd.revents & (POLLIN | POLLRDNORM)) -- goto dequeue; -+ return 0; -+} - -- // CAPTURE Q drained -- if (no_rx_means_done) { -- if (ctx_done(ctx) > 0) -- goto start; -- return NULL; -- } - -- /* the driver is ready to accept more input; instead of waiting for the capture -- * buffer to complete we return NULL so input can proceed (we are single threaded) -- */ -- if (pfd.revents & (POLLOUT | POLLWRNORM)) -- return NULL; -+// Get a buffer -+// If output then just gets the buffer in the expected way -+// If capture then runs the capture state m/c to deal with res change etc. -+// If return value == 0 then *ppavbuf != NULL -+ -+static int -+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) -+{ -+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); -+ AVCodecContext * const avctx = m->avctx; -+ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); -+ -+ const unsigned int poll_cap = (POLLIN | POLLRDNORM); -+ const unsigned int poll_out = (POLLOUT | POLLWRNORM); -+ const unsigned int poll_event = POLLPRI; -+ -+ *ppavbuf = NULL; -+ -+ for (;;) { -+ struct pollfd pfd = { -+ .fd = m->fd, -+ // If capture && stream not started then assume we are waiting for the initial event -+ .events = !is_cap ? poll_out : -+ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? 
poll_cap : -+ poll_event, -+ }; -+ int ret; -+ -+ if (ctx->done) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); -+ return AVERROR_EOF; - } - --dequeue: -- memset(&buf, 0, sizeof(buf)); -- buf.memory = V4L2_MEMORY_MMAP; -- buf.type = ctx->type; -- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- memset(planes, 0, sizeof(planes)); -- buf.length = VIDEO_MAX_PLANES; -- buf.m.planes = planes; -+ // If capture && timeout == -1 then also wait for rx buffer free -+ if (is_cap && timeout == -1 && m->output.streamon && !m->draining) -+ pfd.events |= poll_out; -+ -+ // If nothing Qed all we will get is POLLERR - avoid that -+ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || -+ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || -+ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); -+ return AVERROR(EAGAIN); - } - -- while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { -- const int err = errno; -- if (err == EINTR) -+ // Timeout kludged s.t. "forever" eventually gives up & produces logging -+ // If waiting for an event when we have seen a last_frame then we expect -+ // it to be ready already so force a short timeout -+ ret = poll(&pfd, 1, -+ ff_v4l2_ctx_eos(ctx) ? 10 : -+ timeout == -1 ? 3000 : timeout); -+ if (ret < 0) { -+ ret = AVERROR(errno); // Remember errno before logging etc. 
-+ av_assert0(ret < 0); -+ } -+ -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", -+ ctx->name, ret, timeout, pfd.events, pfd.revents); -+ -+ if (ret < 0) { -+ if (ret == AVERROR(EINTR)) - continue; -- if (err != EAGAIN) { -- // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST -- if (err != EPIPE || !is_capture) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -- ctx->name, av_err2str(AVERROR(err))); -- if (ctx_done(ctx) > 0) -- goto start; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); -+ return ret; -+ } -+ -+ if (ret == 0) { -+ if (timeout == -1) -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); -+ if (ff_v4l2_ctx_eos(ctx)) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); -+ ret = get_event(m); -+ if (ret < 0) { -+ ctx->done = 1; -+ return ret; -+ } - } -- return NULL; -+ return AVERROR(EAGAIN); - } -- --ctx->q_count; -- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", -- ctx->name, buf.index, -- buf.timestamp.tv_sec, buf.timestamp.tv_usec, -- ctx->q_count, ++ctx->dq_count, buf.field); -- -- avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -- avbuf->status = V4L2BUF_AVAILABLE; -- avbuf->buf = buf; -- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- memcpy(avbuf->planes, planes, sizeof(planes)); -- avbuf->buf.m.planes = avbuf->planes; -+ -+ if ((pfd.revents & POLLERR) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); -+ return AVERROR_UNKNOWN; - } - -- if (ctx_to_m2mctx(ctx)->draining && is_capture) { -- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
-- buf.m.planes[0].bytesused : buf.bytesused; -- if (bytesused == 0) { -- av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); -+ if ((pfd.revents & poll_event) != 0) { -+ ret = get_event(m); -+ if (ret < 0) { -+ ctx->done = 1; -+ return ret; -+ } -+ continue; -+ } - -- // Must reQ so we don't leak -- // May not matter if the next thing we do is release all the -- // buffers but better to be tidy. -- ff_v4l2_buffer_enqueue(avbuf); -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ continue; -+ return ret; -+ } - -- if (ctx_done(ctx) > 0) -- goto start; -- return NULL; -- } --#ifdef V4L2_BUF_FLAG_LAST -- if (buf.flags & V4L2_BUF_FLAG_LAST) { -- av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); -- avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer -- ctx_done(ctx); -- } --#endif -+ if ((pfd.revents & poll_out) != 0) { -+ if (is_cap) -+ return AVERROR(EAGAIN); -+ return dq_buf(ctx, ppavbuf); - } - -- return avbuf; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); -+ return AVERROR_UNKNOWN; - } -- -- return NULL; - } - - static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - { -- int timeout = 0; /* return when no more buffers to dequeue */ - int i; - - /* get back as many output buffers as possible */ - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- do { -- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); -+ V4L2Buffer * avbuf; -+ do { -+ get_qbuf(ctx, &avbuf, 0); -+ } while (avbuf); - } - - for (i = 0; i < ctx->num_buffers; i++) { -@@ -722,7 +706,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) - if (buf->status == V4L2BUF_IN_DRIVER) - buf->status = V4L2BUF_AVAILABLE; - } -- ctx->q_count = 0; -+ atomic_store(&ctx->q_count, 0); - } - - static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) -@@ -755,6 +739,10 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - int ret; - AVCodecContext * const 
avctx = logger(ctx); - -+ // Avoid doing anything if there is nothing we can do -+ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) -+ return 0; -+ - ff_mutex_lock(&ctx->lock); - - if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -@@ -777,6 +765,9 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF"); - } - -+ // Both stream off & on effectively clear flag_last -+ ctx->flag_last = 0; -+ - ff_mutex_unlock(&ctx->lock); - - return ret; -@@ -840,19 +831,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - { - V4L2Buffer *avbuf; -+ int rv; - -- /* -- * timeout=-1 blocks until: -- * 1. decoded frame available -- * 2. an input buffer is ready to be dequeued -- */ -- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); -- if (!avbuf) { -- if (ctx->done) -- return AVERROR_EOF; -- -- return AVERROR(EAGAIN); -- } -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -+ return rv; - - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); - } -@@ -860,19 +842,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) - { - V4L2Buffer *avbuf; -+ int rv; - -- /* -- * blocks until: -- * 1. encoded packet available -- * 2. 
an input buffer ready to be dequeued -- */ -- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); -- if (!avbuf) { -- if (ctx->done) -- return AVERROR_EOF; -- -- return AVERROR(EAGAIN); -- } -+ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ return rv; - - return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); - } -@@ -956,6 +929,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers - int ret; - int i; - -+ av_assert0(ctx->bufrefs == NULL); -+ - memset(&req, 0, sizeof(req)); - req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; -@@ -1033,8 +1008,8 @@ int ff_v4l2_context_init(V4L2Context* ctx) - hwframes = (AVHWFramesContext*)ctx->frames_ref->data; - hwframes->format = AV_PIX_FMT_DRM_PRIME; - hwframes->sw_format = ctx->av_pix_fmt; -- hwframes->width = ctx->width; -- hwframes->height = ctx->height; -+ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; -+ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height; - ret = av_hwframe_ctx_init(ctx->frames_ref); - if (ret < 0) - goto fail_unref_hwframes; -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index a4176448d5..565858a1ed 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -102,6 +102,8 @@ typedef struct V4L2Context { - */ - int done; - -+ int flag_last; -+ - /** - * PTS rescale not wanted - * If the PTS is just a dummy frame count then rescale is -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 516e6d9858..e26bd74c3e 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -235,7 +235,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) - - /* 5. 
complete reinit */ - s->draining = 0; -- s->reinit = 0; - - return 0; - } -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 3f86809623..d71f6b721c 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -84,8 +84,6 @@ typedef struct V4L2m2mContext { - AVCodecContext *avctx; - sem_t refsync; - atomic_uint refcount; -- int reinit; -- int resize_pending; - - /* null frame/packet received */ - int draining; -@@ -180,15 +178,25 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); - int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); - - --static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) -+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) - { - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; - } - --static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) -+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) - { - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; - } - -+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; -+} -+ -+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) -+{ -+ return ctx->flag_last; -+} -+ - - #endif /* AVCODEC_V4L2_M2M_H */ -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 09ec496351..e4b6569ba5 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -113,9 +113,6 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); - -- if (!s->capture.streamon || ret < 0) -- return ret; -- - ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); -@@ -127,69 +124,12 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co - - static int v4l2_try_start(AVCodecContext *avctx) - { -- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- V4L2Context *const capture = &s->capture; -- struct v4l2_selection selection = { 0 }; -+ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; - int ret; - - /* 1. start the output process */ - if ((ret = check_output_streamon(avctx, s)) != 0) - return ret; -- -- if (capture->streamon) -- return 0; -- -- /* 2. get the capture format */ -- capture->format.type = capture->type; -- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); -- if (ret) { -- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); -- return ret; -- } -- -- /* 2.1 update the AVCodecContext */ -- capture->av_pix_fmt = -- ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -- if (s->output_drm) { -- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -- avctx->sw_pix_fmt = capture->av_pix_fmt; -- } -- else -- avctx->pix_fmt = capture->av_pix_fmt; -- -- /* 3. 
set the crop parameters */ --#if 1 -- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -- selection.target = V4L2_SEL_TGT_CROP_DEFAULT; -- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -- av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); --#else -- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -- selection.r.height = avctx->coded_height; -- selection.r.width = avctx->coded_width; -- av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); -- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); -- av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -- if (1) { -- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -- if (ret) { -- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); -- } else { -- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); -- /* update the size of the resulting frame */ -- capture->height = selection.r.height; -- capture->width = selection.r.width; -- } -- } --#endif -- -- /* 5. 
start the capture process */ -- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -- if (ret) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); -- return ret; -- } -- - return 0; - } - -@@ -364,7 +304,7 @@ xlat_pending(const xlat_track_t * const x) - } - - static inline int stream_started(const V4L2m2mContext * const s) { -- return s->capture.streamon && s->output.streamon; -+ return s->output.streamon; - } - - #define NQ_OK 0 -@@ -377,6 +317,9 @@ static inline int stream_started(const V4L2m2mContext * const s) { - #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) - #define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) - -+// do_not_get If true then no new packet will be got but status will -+// be set appropriately -+ - // AVERROR_EOF Flushing an already flushed stream - // -ve Error (all errors except EOF are unexpected) - // NQ_OK (0) OK -@@ -386,14 +329,14 @@ static inline int stream_started(const V4L2m2mContext * const s) { - // NQ_DRAINING At EOS, dQ dest until EOS there too - // NQ_DEAD Not running (do not retry, do not attempt capture dQ) - --static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) -+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) - { - int ret; - - // If we don't already have a coded packet - get a new one - // We will already have a coded pkt if the output Q was full last time we - // tried to Q it -- if (!s->buf_pkt.size) { -+ if (!s->buf_pkt.size && !do_not_get) { - ret = ff_decode_get_packet(avctx, &s->buf_pkt); - - if (ret == AVERROR(EAGAIN)) { -@@ -435,6 +378,17 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); - } - -+ if (s->draining) { -+ if (s->buf_pkt.size) { -+ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); -+ av_packet_unref(&s->buf_pkt); -+ } -+ return 
NQ_DRAINING; -+ } -+ -+ if (!s->buf_pkt.size) -+ return NQ_NONE; -+ - if ((ret = check_output_streamon(avctx, s)) != 0) - return ret; - -@@ -471,7 +425,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- int src_rv = NQ_NONE; -+ int src_rv; - int dst_rv = 1; // Non-zero (done), non-negative (error) number - unsigned int i = 0; - -@@ -483,31 +437,40 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // (a) We don't have a lot of stuff in the buffer already OR - // (b) ... we (think we) do but we've failed to get a frame already OR - // (c) We've dequeued a lot of frames without asking for input -- if (!prefer_dq || i != 0 || s->req_pkt > 2) { -- src_rv = try_enqueue_src(avctx, s); -- -- // If we got a frame last time or we've already tried to get a frame and -- // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) -- // indicating that we want more input. -- // This should mean that once decode starts we enter a stable state where -- // we alternately ask for input and produce output -- if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) -- break; -- } -+ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); -+ -+ // If we got a frame last time or we've already tried to get a frame and -+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) -+ // indicating that we want more input. 
-+ // This should mean that once decode starts we enter a stable state where -+ // we alternately ask for input and produce output -+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) -+ break; - - // Try to get a new frame if - // (a) we haven't already got one AND - // (b) enqueue returned a status indicating that decode should be attempted - if (dst_rv != 0 && TRY_DQ(src_rv)) { -+ // Pick a timeout depending on state -+ const int t = -+ src_rv == NQ_DRAINING ? 300 : -+ prefer_dq ? 5 : -+ src_rv == NQ_Q_FULL ? -1 : 0; -+ - do { - // Dequeue frame will unref any previous contents of frame - // if it returns success so we don't need an explicit unref - // when discarding - // This returns AVERROR(EAGAIN) on timeout or if - // there is room in the input Q and timeout == -1 -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1); -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - -- if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { -+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -+ dst_rv = AVERROR_EOF; -+ s->capture.done = 1; -+ } -+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) - av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", - s->draining, s->capture.done); - else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -@@ -630,8 +593,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - * by the v4l2 driver; this event will trigger a full pipeline reconfig and - * the proper values will be retrieved from the kernel driver. 
- */ -- output->height = capture->height = avctx->coded_height; -- output->width = capture->width = avctx->coded_width; -+// output->height = capture->height = avctx->coded_height; -+// output->width = capture->width = avctx->coded_width; -+ output->height = capture->height = 0; -+ output->width = capture->width = 0; - - output->av_codec_id = avctx->codec_id; - output->av_pix_fmt = AV_PIX_FMT_NONE; -@@ -703,7 +668,6 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - V4L2m2mContext * const s = priv->context; - V4L2Context * const output = &s->output; - V4L2Context * const capture = &s->capture; -- int ret; - - av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); - -@@ -711,13 +675,19 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - // states like EOS processing so don't try to optimize out (having got it - // wrong once) - -- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); -- if (ret < 0) -- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); -+ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); - - // Clear any buffered input packet - av_packet_unref(&s->buf_pkt); - -+ // Clear a pending EOS -+ if (ff_v4l2_ctx_eos(capture)) { -+ // Arguably we could delay this but this is easy and doesn't require -+ // thought or extra vars -+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); -+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -+ } -+ - // V4L2 makes no guarantees about whether decoded frames are flushed or not - // so mark all frames we are tracking to be discarded if they appear - xlat_flush(&s->xlat); - -From a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 9 Dec 2021 18:51:00 +0000 -Subject: [PATCH 039/136] Honor result of ff_get_format if possible - ---- - libavcodec/v4l2_m2m_dec.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 
e4b6569ba5..c9655bcc3b 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -615,15 +615,19 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - * check the v4l2_get_drm_frame function. - */ - -+ avctx->sw_pix_fmt = avctx->pix_fmt; - gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); - av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", - avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); - -- s->output_drm = 0; - if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - s->output_drm = 1; - } -+ else { -+ capture->av_pix_fmt = gf_pix_fmt; -+ s->output_drm = 0; -+ } - - s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); - if (!s->device_ref) { - -From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 14 Dec 2021 16:11:10 +0000 -Subject: [PATCH 040/136] Add an always-reinit quirk - ---- - libavcodec/v4l2_context.c | 7 +++++-- - libavcodec/v4l2_m2m.h | 5 +++++ - libavcodec/v4l2_m2m_dec.c | 33 ++++++++++++++++++++++++++++++++- - 3 files changed, 42 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index d765181645..c11b5e6863 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -188,6 +188,9 @@ static int do_source_change(V4L2m2mContext * const s) - get_default_selection(&s->capture, &s->capture.selection); - - reinit = ctx_resolution_changed(&s->capture, &cap_fmt); -+ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) -+ reinit = 1; -+ - s->capture.format = cap_fmt; - if (reinit) { - s->capture.height = ff_v4l2_get_format_height(&cap_fmt); -@@ -202,10 +205,10 @@ static int do_source_change(V4L2m2mContext * const s) - - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - -- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d 
@ %d,%d\n", -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", - s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, - s->capture.selection.width, s->capture.selection.height, -- s->capture.selection.left, s->capture.selection.top); -+ s->capture.selection.left, s->capture.selection.top, reinit); - - if (reinit) { - if (avctx) -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index d71f6b721c..f1923bb26d 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -113,6 +113,11 @@ typedef struct V4L2m2mContext { - - /* Ext data sent */ - int extdata_sent; -+ -+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 -+ /* Quirks */ -+ unsigned int quirks; -+ - } V4L2m2mContext; - - typedef struct V4L2m2mPriv { -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index c9655bcc3b..e2b10f5e3a 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -540,6 +540,34 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - #endif - -+static int -+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) -+{ -+ struct v4l2_capability cap; -+ -+ memset(&cap, 0, sizeof(cap)); -+ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { -+ int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); -+ return AVERROR(err); -+ } -+ -+ // Could be made table driven if we have a few more but right now there -+ // seems no point -+ -+ // Meson (amlogic) always gives a resolution changed event after output -+ // streamon and userspace must (re)allocate capture buffers and streamon -+ // capture to clear the event even if the capture buffers were the right -+ // size in the first place. 
-+ if (strcmp(cap.driver, "meson-vdec") == 0) -+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); -+ return 0; -+} -+ - // This heuristic is for H264 but use for everything - static uint32_t max_coded_size(const AVCodecContext * const avctx) - { -@@ -646,7 +674,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - return ret; - } - -- return v4l2_prepare_decoder(s); -+ if ((ret = v4l2_prepare_decoder(s)) < 0) -+ return ret; -+ -+ return get_quirks(avctx, s); - } - - static av_cold int v4l2_decode_close(AVCodecContext *avctx) - -From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 4 Jan 2022 16:58:31 +0000 -Subject: [PATCH 041/136] v4l2_buffers: rework flags for keyframe - -Previously flags could become confused and keyframe info could be lost. -This fixes that and removes the duplicate flags field in V4L2Buffer. ---- - libavcodec/v4l2_buffers.c | 15 ++++++++++----- - libavcodec/v4l2_buffers.h | 1 - - libavcodec/v4l2_context.c | 18 +++++++++++++++++- - 3 files changed, 27 insertions(+), 7 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 2cf7be6632..62d1c26053 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -680,7 +680,9 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { -- out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); -+ out->buf.flags = frame->key_frame ? 
-+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); - // Beware that colour info is held in format rather than the actual - // v4l2 buffer struct so this may not be as useful as you might hope - v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); -@@ -706,6 +708,10 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - /* 2. get frame information */ - frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); -+ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : -+ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : -+ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B : -+ AV_PICTURE_TYPE_NONE; - frame->color_primaries = v4l2_get_color_primaries(avbuf); - frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); -@@ -779,8 +785,9 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - - v4l2_set_pts(out, pkt->pts); - -- if (pkt->flags & AV_PKT_FLAG_KEY) -- out->flags = V4L2_BUF_FLAG_KEYFRAME; -+ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
-+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); - - return ret; - } -@@ -924,8 +931,6 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - int ret; - int qc; - -- avbuf->buf.flags = avbuf->flags; -- - if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { - av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", - avbuf->context->name, avbuf->buf.index, -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 641e0e147b..3b7ca4d99e 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -73,7 +73,6 @@ typedef struct V4L2Buffer { - struct v4l2_buffer buf; - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - -- int flags; - enum V4L2Buffer_status status; - - } V4L2Buffer; -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index c11b5e6863..53b522d43e 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -527,6 +527,22 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout - } - } - -+// Clear out flags and timestamps that should should be set by the user -+// Returns the passed avbuf -+static V4L2Buffer * -+clean_v4l2_buffer(V4L2Buffer * const avbuf) -+{ -+ struct v4l2_buffer *const buf = &avbuf->buf; -+ -+ buf->flags = 0; -+ buf->field = V4L2_FIELD_ANY; -+ buf->timestamp = (struct timeval){0}; -+ buf->timecode = (struct v4l2_timecode){0}; -+ buf->sequence = 0; -+ -+ return avbuf; -+} -+ - static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - { - int i; -@@ -542,7 +558,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; - if (avbuf->status == V4L2BUF_AVAILABLE) -- return avbuf; -+ return clean_v4l2_buffer(avbuf); - } - - return NULL; - -From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 22 Mar 2022 
11:44:30 +0000 -Subject: [PATCH 042/136] v4l2m2m: Rework decode to wait for missing buffer, - add dynamic pending - -Previously receive_frame exited with EAGAIN if no capture buffer -availble in the Q. Now it waits in the hope that another thread will -post one. - -The prefer dQ logic is now dynamic to help with cases where PTS/DTS -lies. If it looks like we are never getting a frame then the -threshold is increased. It then slowly decays over time to cope with -false alarms. ---- - libavcodec/v4l2_buffers.c | 6 +++-- - libavcodec/v4l2_context.c | 7 +++-- - libavcodec/v4l2_context.h | 3 +++ - libavcodec/v4l2_m2m.h | 2 ++ - libavcodec/v4l2_m2m_dec.c | 57 +++++++++++++++++++++++++++++++++++++-- - 5 files changed, 69 insertions(+), 6 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 62d1c26053..8c4f18dbed 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -947,12 +947,14 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - return AVERROR(err); - } - -+ // Lock not wanted - if called from buffer free then lock already obtained - qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; -+ avbuf->status = V4L2BUF_IN_DRIVER; -+ pthread_cond_broadcast(&avbuf->context->cond); -+ - av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", - avbuf->context->name, avbuf->buf.index, - avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); - -- avbuf->status = V4L2BUF_IN_DRIVER; -- - return 0; - } -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 53b522d43e..7ddb759810 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -300,6 +300,7 @@ static int v4l2_stop_encode(V4L2Context *ctx) - // Returns: - // 0 Success - // AVERROR(EPIPE) Nothing more to read -+// AVERROR(ENOSPC) No buffers in Q to put result in - // * AVERROR(..) 
- - static int -@@ -457,7 +458,7 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout - (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || - (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { - av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); -- return AVERROR(EAGAIN); -+ return AVERROR(ENOSPC); - } - - // Timeout kludged s.t. "forever" eventually gives up & produces logging -@@ -864,7 +865,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) - int rv; - - if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -- return rv; -+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC - - return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); - } -@@ -938,6 +939,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) - av_buffer_unref(&ctx->frames_ref); - - ff_mutex_destroy(&ctx->lock); -+ pthread_cond_destroy(&ctx->cond); - } - - -@@ -1013,6 +1015,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) - } - - ff_mutex_init(&ctx->lock, NULL); -+ pthread_cond_init(&ctx->cond, NULL); - atomic_init(&ctx->q_count, 0); - - if (s->output_drm) { -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 565858a1ed..0efff58f18 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -116,6 +116,7 @@ typedef struct V4L2Context { - struct ff_weak_link_master *wl_master; - - AVMutex lock; -+ pthread_cond_t cond; - } V4L2Context; - - /** -@@ -182,6 +183,8 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); - * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) - * - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
-+ * AVERROR(ENOSPC) if no buffer availible to put -+ * the frame in - */ - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index f1923bb26d..9a20447030 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -105,6 +105,8 @@ typedef struct V4L2m2mContext { - - /* Frame tracking */ - xlat_track_t xlat; -+ int pending_hw; -+ int pending_n; - - pts_stats_t pts_stat; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index e2b10f5e3a..2e30449dfc 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -251,7 +251,8 @@ xlat_pts_out(AVCodecContext *const avctx, - - frame->best_effort_timestamp = pts_stats_guess(ps); - frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); - return 0; - } - -@@ -422,6 +423,36 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - return ret; - } - -+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) -+{ -+ int rv = 0; -+ -+ ff_mutex_lock(&ctx->lock); -+ -+ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { -+ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { -+ rv = AVERROR(errno); -+ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); -+ break; -+ } -+ } -+ -+ ff_mutex_unlock(&ctx->lock); -+ return rv; -+} -+ -+// Number of frames over what xlat_pending returns that we keep *16 -+// This is a min value - if it appears to be too small the threshold should -+// adjust dynamically. 
-+#define PENDING_HW_MIN (3 * 16) -+// Offset to use when setting dynamically -+// Set to %16 == 15 to avoid the threshold changing immediately as we relax -+#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) -+// Number of consecutive times we've failed to get a frame when we prefer it -+// before we increase the prefer threshold (5ms * N = max expected decode -+// time) -+#define PENDING_N_THRESHOLD 6 -+ - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -@@ -431,7 +462,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - - do { - const int pending = xlat_pending(&s->xlat); -- const int prefer_dq = (pending > 5); -+ const int prefer_dq = (pending > s->pending_hw / 16); - - // Enqueue another pkt for decode if - // (a) We don't have a lot of stuff in the buffer already OR -@@ -465,6 +496,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // there is room in the input Q and timeout == -1 - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - -+ // Failure due to no buffer in Q? 
-+ if (dst_rv == AVERROR(ENOSPC)) { -+ // Wait & retry -+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -+ } -+ } -+ -+ // Adjust dynamic pending threshold -+ if (dst_rv == 0) { -+ if (--s->pending_hw < PENDING_HW_MIN) -+ s->pending_hw = PENDING_HW_MIN; -+ s->pending_n = 0; -+ } -+ else if (dst_rv == AVERROR(EAGAIN)) { -+ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { -+ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; -+ s->pending_n = 0; -+ } -+ } -+ - if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { - av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); - dst_rv = AVERROR_EOF; -@@ -613,6 +665,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - xlat_init(&s->xlat); - pts_stats_init(&s->pts_stat, avctx, "decoder"); -+ s->pending_hw = PENDING_HW_MIN; - - capture = &s->capture; - output = &s->output; - -From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 25 Mar 2022 15:37:58 +0000 -Subject: [PATCH 043/136] v4l2_m2m2_dec: Avoid loop if unable to resize buffers - -If source change signals a buffer size that cannot be honored give up -rather than looping indefinitely. This happens on Pi if (say) a -2560x1440 h264 stream is presented to the decode. 
---- - libavcodec/v4l2_context.c | 13 +++++++++++-- - 1 file changed, 11 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 7ddb759810..007a58c8f1 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -205,8 +205,9 @@ static int do_source_change(V4L2m2mContext * const s) - - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - -- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", - s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, -+ s->capture.width, s->capture.height, - s->capture.selection.width, s->capture.selection.height, - s->capture.selection.left, s->capture.selection.top, reinit); - -@@ -224,9 +225,17 @@ static int do_source_change(V4L2m2mContext * const s) - return AVERROR(EINVAL); - } - -+ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || -+ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { -+ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", -+ s->capture.width, s->capture.height, -+ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); -+ return AVERROR(EINVAL); -+ } -+ - // Update pixel format - should only actually do something on initial change - s->capture.av_pix_fmt = -- ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); -+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); - if (s->output_drm) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - avctx->sw_pix_fmt = s->capture.av_pix_fmt; - -From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 25 Mar 2022 18:14:40 +0000 -Subject: [PATCH 044/136] v4l2dec: Improve size/format validation on init 
- ---- - libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++-- - libavcodec/v4l2_request_hevc.c | 11 +++++ - 2 files changed, 92 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 2e30449dfc..8dcadf461b 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -592,6 +592,76 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - #endif - -+static int -+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) -+{ -+ unsigned int i; -+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); -+ const uint32_t w = avctx->coded_width; -+ const uint32_t h = avctx->coded_height; -+ -+ if (w == 0 || h == 0 || fcc == 0) { -+ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); -+ return 0; -+ } -+ -+ for (i = 0;; ++i) { -+ struct v4l2_frmsizeenum fs = { -+ .index = i, -+ .pixel_format = fcc, -+ }; -+ -+ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { -+ const int err = AVERROR(errno); -+ if (err == AVERROR(EINTR)) -+ continue; -+ if (i == 0 && err == AVERROR(ENOTTY)) { -+ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); -+ return 0; -+ } -+ if (err != AVERROR(EINVAL)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); -+ return err; -+ } -+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", -+ w, h, av_fourcc2str(fcc)); -+ return err; -+ } -+ -+ switch (fs.type) { -+ case V4L2_FRMSIZE_TYPE_DISCRETE: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, -+ fs.discrete.width,fs.discrete.height); -+ if (w == fs.discrete.width && h == fs.discrete.height) -+ return 0; -+ break; -+ case V4L2_FRMSIZE_TYPE_STEPWISE: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, -+ fs.stepwise.min_width, fs.stepwise.min_height, -+ 
fs.stepwise.max_width, fs.stepwise.max_height, -+ fs.stepwise.step_width,fs.stepwise.step_height); -+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && -+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && -+ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && -+ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) -+ return 0; -+ break; -+ case V4L2_FRMSIZE_TYPE_CONTINUOUS: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, -+ fs.stepwise.min_width, fs.stepwise.min_height, -+ fs.stepwise.max_width, fs.stepwise.max_height, -+ fs.stepwise.step_width,fs.stepwise.step_height); -+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && -+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) -+ return 0; -+ break; -+ default: -+ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); -+ return AVERROR(EINVAL); -+ } -+ } -+} -+ - static int - get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) - { -@@ -698,8 +768,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - - avctx->sw_pix_fmt = avctx->pix_fmt; - gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -- av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", -- avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), -+ avctx->coded_width, avctx->coded_height, -+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); - - if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -@@ -730,7 +802,13 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if ((ret = v4l2_prepare_decoder(s)) < 0) - return ret; - -- return get_quirks(avctx, s); -+ if ((ret = get_quirks(avctx, 
s)) != 0) -+ return ret; -+ -+ if ((ret = check_size(avctx, s)) != 0) -+ return ret; -+ -+ return 0; - } - - static av_cold int v4l2_decode_close(AVCodecContext *avctx) -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index b0a5930844..76ab0916cd 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -147,6 +147,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - - av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); - -+ // Give up immediately if this is something that we have no code to deal with -+ if (h->ps.sps->chroma_format_idc != 1) { -+ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); -+ return AVERROR_PATCHWELCOME; -+ } -+ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || -+ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { -+ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); -+ return AVERROR_PATCHWELCOME; -+ } -+ - if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { - av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); - return (AVERROR(-ret)); - -From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 13 Apr 2022 16:05:56 +0000 -Subject: [PATCH 045/136] v4l2 stateless hevc: Add another API variation for - linux 5.18 - -This is probably going to be a short lived variation and may end up -being reverted if no release using it ever ends up in the wild. 
---- - libavcodec/Makefile | 2 +- - libavcodec/hevc-ctrls-v3.h | 255 +++++++++++++++++++++++++++++++++ - libavcodec/v4l2_req_hevc_v3.c | 3 + - libavcodec/v4l2_req_hevc_vx.c | 17 +++ - libavcodec/v4l2_req_media.c | 15 +- - libavcodec/v4l2_req_media.h | 3 + - libavcodec/v4l2_request_hevc.c | 6 +- - libavcodec/v4l2_request_hevc.h | 1 + - 8 files changed, 295 insertions(+), 7 deletions(-) - create mode 100644 libavcodec/hevc-ctrls-v3.h - create mode 100644 libavcodec/v4l2_req_hevc_v3.c - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index e1aa0ba014..2b3c16185d 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o - OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h -new file mode 100644 -index 0000000000..4e35bd583d ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v3.h -@@ -0,0 +1,255 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. 
*/ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 flags; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 padding[5]; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u8 num_active_dpb_entries; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ -+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) -+/* -+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - -+ * the number of data (in bits) to skip in the -+ * slice segment header. -+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" -+ * to before syntax element "slice_temporal_mvp_enabled_flag". -+ * If IDR, the skipped bits are just "pic_output_flag" -+ * (separate_colour_plane_flag is not supported). 
-+ */ -+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) -+ -+#endif -diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c -new file mode 100644 -index 0000000000..dcc8d95632 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v3.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 3 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 0ae03b10c4..611fa21cc3 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -16,6 +16,8 @@ - - #elif HEVC_CTRLS_VERSION == 2 - #include "hevc-ctrls-v2.h" -+#elif HEVC_CTRLS_VERSION == 3 -+#include "hevc-ctrls-v3.h" - #else - #error Unknown HEVC_CTRLS_VERSION - #endif -@@ -147,6 +149,7 @@ static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_t - } - } - -+#if HEVC_CTRLS_VERSION <= 2 - static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) - { - const HEVCFrame *frame; -@@ -172,6 +175,7 @@ static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) - - return 0; - } -+#endif - - static unsigned int - get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, -@@ -247,7 +251,12 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const - struct v4l2_hevc_dpb_entry * const entry = entries + n++; - - entry->timestamp = frame_capture_dpb(frame->frame); -+#if HEVC_CTRLS_VERSION <= 2 - entry->rps = find_frame_rps_type(h, entry->timestamp); -+#else -+ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : -+ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; -+#endif - entry->field_pic = frame->frame->interlaced_frame; - - /* TODO: Interleaved: Get the POC for each field. 
*/ -@@ -1011,6 +1020,14 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - }; - const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); - -+#if HEVC_CTRLS_VERSION == 2 -+ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) -+ return AVERROR(EINVAL); -+#elif HEVC_CTRLS_VERSION == 3 -+ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) -+ return AVERROR(EINVAL); -+#endif -+ - if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { - av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); - return AVERROR(EINVAL); -diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c -index eb00ecb406..980b306b8a 100644 ---- a/libavcodec/v4l2_req_media.c -+++ b/libavcodec/v4l2_req_media.c -@@ -604,6 +604,7 @@ struct mediabufs_ctl { - - struct v4l2_format src_fmt; - struct v4l2_format dst_fmt; -+ struct v4l2_capability capability; - }; - - static int qe_v4l2_queue(struct qent_base *const be, -@@ -1498,20 +1499,24 @@ void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) - mediabufs_ctl_delete(mbc); - } - -+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) -+{ -+ return mbc->capability.version; -+} -+ - static int set_capabilities(struct mediabufs_ctl *const mbc) - { -- struct v4l2_capability capability = { 0 }; - uint32_t caps; - -- if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { -+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { - int err = errno; - request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); - return -err; - } - -- caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? -- capability.device_caps : -- capability.capabilities; -+ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
-+ mbc->capability.device_caps : -+ mbc->capability.capabilities; - - if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { - mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h -index 2f826cfb14..0307a831de 100644 ---- a/libavcodec/v4l2_req_media.h -+++ b/libavcodec/v4l2_req_media.h -@@ -142,6 +142,9 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, - struct dmabufs_ctl * const dbsc, - unsigned int n); - -+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) -+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); -+ - struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, - const char *vpath, struct pollqueue *const pq); - void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 76ab0916cd..20e4e0ab15 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - goto fail4; - } - -- if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { -+ if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 3); -+ } -+ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 2); - } -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -index f14f594564..ed48d62e2d 100644 ---- a/libavcodec/v4l2_request_hevc.h -+++ b/libavcodec/v4l2_request_hevc.h -@@ -98,5 +98,6 @@ typedef struct v4l2_req_decode_fns { - - extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); - extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); - - #endif - -From 
92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 3 May 2022 12:44:42 +0000 -Subject: [PATCH 046/136] Remove V4l2 frame size check for meson-vdec - ---- - libavcodec/v4l2_m2m.h | 3 ++- - libavcodec/v4l2_m2m_dec.c | 10 +++++++--- - 2 files changed, 9 insertions(+), 4 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 9a20447030..6bd5e8eda7 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -116,7 +116,8 @@ typedef struct V4L2m2mContext { - /* Ext data sent */ - int extdata_sent; - --#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 -+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 -+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 - /* Quirks */ - unsigned int quirks; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 8dcadf461b..888ba67fea 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -604,6 +604,10 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) - av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); - return 0; - } -+ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { -+ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); -+ return 0; -+ } - - for (i = 0;; ++i) { - struct v4l2_frmsizeenum fs = { -@@ -623,8 +627,8 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) - av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); - return err; - } -- av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", -- w, h, av_fourcc2str(fcc)); -+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", -+ w, h, av_fourcc2str(fcc), i); - return err; - } - -@@ -684,7 +688,7 @@ get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) - // capture to clear the event even if the capture buffers were the right 
- // size in the first place. - if (strcmp(cap.driver, "meson-vdec") == 0) -- s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; -+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; - - av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); - return 0; - -From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 23 May 2022 18:05:20 +0100 -Subject: [PATCH 047/136] v4l2m2m_dec: Make some error rturns a bit more robust - ---- - libavcodec/v4l2_context.c | 5 ++--- - libavcodec/v4l2_m2m_dec.c | 23 ++++++++++++++--------- - 2 files changed, 16 insertions(+), 12 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 007a58c8f1..b3662aedaa 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -765,7 +765,7 @@ static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) - int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - { - int type = ctx->type; -- int ret; -+ int ret = 0; - AVCodecContext * const avctx = logger(ctx); - - // Avoid doing anything if there is nothing we can do -@@ -777,8 +777,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) - stuff_all_buffers(avctx, ctx); - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); -- if (ret < 0) { -+ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { - const int err = errno; - av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, - cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF", err); -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 888ba67fea..88a341aae2 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -110,16 +110,21 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co - return 0; - - ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); -- if (ret < 0) -- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); -- -- ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); -- if (ret < 0) -- av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); -- else -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); -+ if (ret != 0) { -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); -+ return ret; -+ } - -- return ret; -+ // STREAMON should do implicit START so this just for those that don't. -+ // It is optional so don't worry if it fails -+ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { -+ ret = AVERROR(errno); -+ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); -+ } -+ else { -+ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); -+ } -+ return 0; - } - - static int v4l2_try_start(AVCodecContext *avctx) - -From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 24 May 2022 17:02:58 +0000 -Subject: [PATCH 048/136] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA - -Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA. Should -also detect and complain about unexpected streams of empty packets. - -This functionality untested as I haven't yet found anything that creates -NEW_EXTRADATA side data. 
---- - libavcodec/v4l2_m2m.c | 1 + - libavcodec/v4l2_m2m.h | 3 +++ - libavcodec/v4l2_m2m_dec.c | 49 ++++++++++++++++++++++++++++++++++++--- - 3 files changed, 50 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index e26bd74c3e..6dd01e2e00 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -251,6 +251,7 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) - av_frame_unref(s->frame); - av_frame_free(&s->frame); - av_packet_unref(&s->buf_pkt); -+ av_freep(&s->extdata_data); - - av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 6bd5e8eda7..19d618698d 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -115,6 +115,9 @@ typedef struct V4L2m2mContext { - - /* Ext data sent */ - int extdata_sent; -+ /* Ext data sent in packet - overrides ctx */ -+ uint8_t * extdata_data; -+ size_t extdata_size; - - #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 - #define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 88a341aae2..392a68f0c7 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -343,7 +343,46 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - // We will already have a coded pkt if the output Q was full last time we - // tried to Q it - if (!s->buf_pkt.size && !do_not_get) { -- ret = ff_decode_get_packet(avctx, &s->buf_pkt); -+ unsigned int i; -+ -+ for (i = 0; i < 256; ++i) { -+ uint8_t * side_data; -+ size_t side_size; -+ -+ ret = ff_decode_get_packet(avctx, &s->buf_pkt); -+ if (ret != 0) -+ break; -+ -+ // New extradata is the only side-data we undertand -+ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); -+ if (side_data) { -+ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); -+ av_freep(&s->extdata_data); -+ if ((s->extdata_data = 
av_malloc(side_size ? side_size : 1)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); -+ return AVERROR(ENOMEM); -+ } -+ memcpy(s->extdata_data, side_data, side_size); -+ s->extdata_size = side_size; -+ s->extdata_sent = 0; -+ } -+ -+ if (s->buf_pkt.size != 0) -+ break; -+ -+ if (s->buf_pkt.side_data_elems == 0) { -+ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); -+ ret = AVERROR_EOF; -+ break; -+ } -+ -+ // Retry a side-data only pkt -+ } -+ // If i >= 256 something has gone wrong -+ if (i >= 256) { -+ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); -+ return AVERROR(EIO); -+ } - - if (ret == AVERROR(EAGAIN)) { - if (!stream_started(s)) { -@@ -398,8 +437,12 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - if ((ret = check_output_streamon(avctx, s)) != 0) - return ret; - -- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, -- avctx->extradata, s->extdata_sent ? 
0 : avctx->extradata_size); -+ if (s->extdata_sent) -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); -+ else if (s->extdata_data) -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); -+ else -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); - - if (ret == AVERROR(EAGAIN)) { - // Out of input buffers - keep packet - -From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 24 May 2022 20:02:48 +0000 -Subject: [PATCH 049/136] v4l2m2m_dec: Catch repeated Q fulls - ---- - libavcodec/v4l2_m2m_dec.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 392a68f0c7..7e17044706 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -504,13 +504,14 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- int src_rv; -+ int src_rv = NQ_OK; - int dst_rv = 1; // Non-zero (done), non-negative (error) number - unsigned int i = 0; - - do { - const int pending = xlat_pending(&s->xlat); - const int prefer_dq = (pending > s->pending_hw / 16); -+ const int last_src_rv = src_rv; - - // Enqueue another pkt for decode if - // (a) We don't have a lot of stuff in the buffer already OR -@@ -526,6 +527,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) - break; - -+ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { -+ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); -+ break; -+ } -+ - // Try to get a new frame if - // (a) we haven't already got one AND - // (b) enqueue returned a status indicating that decode should be attempted - -From 
0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 25 May 2022 15:22:12 +0000 -Subject: [PATCH 050/136] Remove requirement for epoxy & libudev config options - ---- - configure | 26 +++++++++++++++++--------- - pi-util/conf_native.sh | 2 -- - 2 files changed, 17 insertions(+), 11 deletions(-) - -diff --git a/configure b/configure -index b41663c794..fdc95146bf 100755 ---- a/configure -+++ b/configure -@@ -205,6 +205,7 @@ External library support: - --disable-bzlib disable bzlib [autodetect] - --disable-coreimage disable Apple CoreImage framework [autodetect] - --enable-chromaprint enable audio fingerprinting with chromaprint [no] -+ --disable-epoxy disable epoxy [autodetect] - --enable-frei0r enable frei0r video filtering [no] - --enable-gcrypt enable gcrypt, needed for rtmp(t)e support - if openssl, librtmp or gmp is not used [no] -@@ -281,7 +282,7 @@ External library support: - if openssl, gnutls or mbedtls is not used [no] - --enable-libtwolame enable MP2 encoding via libtwolame [no] - --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] -- --enable-libudev enable libudev [no] -+ --disable-libudev disable libudev [autodetect] - --enable-libv4l2 enable libv4l2/v4l-utils [no] - --enable-libvidstab enable video stabilization using vid.stab [no] - --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -1747,7 +1748,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" - avfoundation - bzlib - coreimage -+ epoxy - iconv -+ libudev - libxcb - libxcb_shm - libxcb_shape -@@ -1819,7 +1822,6 @@ EXTERNAL_LIBRARY_LIST=" - libdav1d - libdc1394 - libdrm -- epoxy - libflite - libfontconfig - libfreetype -@@ -1863,7 +1865,6 @@ EXTERNAL_LIBRARY_LIST=" - libtheora - libtwolame - libuavs3d -- libudev - libv4l2 - libvmaf - libvorbis -@@ -3567,9 +3568,8 @@ v4l2_indev_suggest="libv4l2" - v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" --vout_drm_outdev_deps="libdrm vout_drm" 
--vout_egl_outdev_deps="xlib" --vout_egl_outdev_select="epoxy" -+vout_drm_outdev_deps="libdrm" -+vout_egl_outdev_deps="xlib epoxy" - vfwcap_indev_deps="vfw32 vfwcap_defines" - xcbgrab_indev_deps="libxcb" - xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" -@@ -6355,6 +6355,12 @@ if enabled xlib; then - disable xlib - fi - -+enabled libudev && -+ check_pkg_config libudev libudev libudev.h udev_new -+ -+enabled epoxy && -+ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version -+ - check_headers direct.h - check_headers dirent.h - check_headers dxgidebug.h -@@ -6601,7 +6607,6 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d - enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open - enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new - enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion --enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version - enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || - { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && - warn "using libfdk without pkg-config"; } } -@@ -6713,7 +6718,6 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } - enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode --enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init -@@ -6819,9 +6823,13 @@ enabled rkmpp && { require_pkg_config 
rkmpp rockchip_mpp rockchip/r - enabled v4l2_request && { enabled libdrm || - die "ERROR: v4l2-request requires --enable-libdrm"; } && - { enabled libudev || -- die "ERROR: v4l2-request requires --enable-libudev"; } -+ die "ERROR: v4l2-request requires libudev"; } - enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - -+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } -+ -+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && -+ { enabled xlib || die "ERROR: vout_egl requires xlib"; } - - if enabled gcrypt; then - GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -index 65576846e8..37cea71756 100755 ---- a/pi-util/conf_native.sh -+++ b/pi-util/conf_native.sh -@@ -91,8 +91,6 @@ $FFSRC/configure \ - --disable-thumb\ - --enable-v4l2-request\ - --enable-libdrm\ -- --enable-epoxy\ -- --enable-libudev\ - --enable-vout-egl\ - --enable-vout-drm\ - $SHARED_LIBS\ - -From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 27 May 2022 09:36:51 +0000 -Subject: [PATCH 051/136] hevc: If hwaccel avoid creation of s/w only vars - ---- - libavcodec/hevc_refs.c | 35 +++++++++++++++++++++-------------- - libavcodec/hevcdec.c | 42 +++++++++++++++++++++++++++++------------- - 2 files changed, 50 insertions(+), 27 deletions(-) - -diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c -index 811e8feff8..f7cf14eabc 100644 ---- a/libavcodec/hevc_refs.c -+++ b/libavcodec/hevc_refs.c -@@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) - if (!frame->rpl_buf) - goto fail; - -- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); -- if (!frame->tab_mvf_buf) -- goto fail; -- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; -+ if (s->tab_mvf_pool) { -+ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); -+ if (!frame->tab_mvf_buf) -+ goto 
fail; -+ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; -+ } - -- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); -- if (!frame->rpl_tab_buf) -- goto fail; -- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; -- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; -- for (j = 0; j < frame->ctb_count; j++) -- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; -+ if (s->rpl_tab_pool) { -+ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); -+ if (!frame->rpl_tab_buf) -+ goto fail; -+ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; -+ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; -+ for (j = 0; j < frame->ctb_count; j++) -+ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; -+ } - - frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; - frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); -@@ -297,14 +301,17 @@ static int init_slice_rpl(HEVCContext *s) - int ctb_count = frame->ctb_count; - int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; - int i; -+ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; - - if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) - return AVERROR_INVALIDDATA; - -- for (i = ctb_addr_ts; i < ctb_count; i++) -- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; -+ if (frame->rpl_tab) { -+ for (i = ctb_addr_ts; i < ctb_count; i++) -+ frame->rpl_tab[i] = tab; -+ } - -- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; -+ frame->refPicList = tab->refPicList; - - return 0; - } -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 2867cb2e16..17f53322fb 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -536,6 +536,16 @@ static int 
set_sps(HEVCContext *s, const HEVCSPS *sps, - if (!sps) - return 0; - -+ // If hwaccel then we don't need all the s/w decode helper arrays -+ if (s->avctx->hwaccel) { -+ export_stream_params(s, sps); -+ -+ s->avctx->pix_fmt = pix_fmt; -+ s->ps.sps = sps; -+ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; -+ return 0; -+ } -+ - ret = pic_arrays_init(s, sps); - if (ret < 0) - goto fail; -@@ -2890,11 +2900,13 @@ static int hevc_frame_start(HEVCContext *s) - ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); - int ret; - -- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); -- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); -- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); -- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); -- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); -+ if (s->horizontal_bs) { -+ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); -+ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); -+ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); -+ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); -+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); -+ } - - s->is_decoded = 0; - s->first_nal_type = s->nal_unit_type; -@@ -3438,15 +3450,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) - dst->needs_fg = 1; - } - -- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); -- if (!dst->tab_mvf_buf) -- goto fail; -- dst->tab_mvf = src->tab_mvf; -+ if (src->tab_mvf_buf) { -+ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); -+ if (!dst->tab_mvf_buf) -+ goto fail; -+ dst->tab_mvf = src->tab_mvf; -+ } - -- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); -- if (!dst->rpl_tab_buf) -- goto fail; -- dst->rpl_tab = src->rpl_tab; -+ if (src->rpl_tab_buf) { -+ dst->rpl_tab_buf = 
av_buffer_ref(src->rpl_tab_buf); -+ if (!dst->rpl_tab_buf) -+ goto fail; -+ dst->rpl_tab = src->rpl_tab; -+ } - - dst->rpl_buf = av_buffer_ref(src->rpl_buf); - if (!dst->rpl_buf) - -From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 30 May 2022 17:51:44 +0100 -Subject: [PATCH 052/136] rpi_sand: Add SAND30->NV12 conversion - -C code only. Reworks the hwcontext_drm conversion to use the -rpi_sand_fns generic frame convert fn rather than calling the -individual conversion functions directly. This keeps all teh stride and -size logic in a single place. ---- - libavutil/hwcontext_drm.c | 46 ++++++++------------ - libavutil/rpi_sand_fns.c | 89 +++++++++++++++++++++++++++++++++++++++ - libavutil/rpi_sand_fns.h | 5 +++ - 3 files changed, 111 insertions(+), 29 deletions(-) - -diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c -index baf18920fa..137a952d2c 100644 ---- a/libavutil/hwcontext_drm.c -+++ b/libavutil/hwcontext_drm.c -@@ -234,14 +234,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - enum AVHWFrameTransferDirection dir, - enum AVPixelFormat **formats) - { -- enum AVPixelFormat *pix_fmts; -+ enum AVPixelFormat *p; - -- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); -- if (!pix_fmts) -+ p = *formats = av_malloc_array(3, sizeof(*p)); -+ if (!p) - return AVERROR(ENOMEM); - - // **** Offer native sand too ???? -- pix_fmts[0] = -+ *p++ = - #if CONFIG_SAND - ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? 
- AV_PIX_FMT_YUV420P : -@@ -249,9 +249,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - AV_PIX_FMT_YUV420P10LE : - #endif - ctx->sw_format; -- pix_fmts[1] = AV_PIX_FMT_NONE; - -- *formats = pix_fmts; -+#if CONFIG_SAND -+ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || -+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) -+ *p++ = AV_PIX_FMT_NV12; -+#endif -+ -+ *p = AV_PIX_FMT_NONE; - return 0; - } - -@@ -294,29 +299,12 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, - const unsigned int w = FFMIN(dst->width, map->width); - const unsigned int h = FFMIN(dst->height, map->height); - -- if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -- av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -- map->data[0], -- 128, stride2, -- 0, 0, w, h); -- av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -- dst->data[2], dst->linesize[2], -- map->data[1], -- 128, stride2, -- 0, 0, w / 2, h / 2); -- } -- else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -- av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -- map->data[0], -- 128, stride2, -- 0, 0, w, h); -- av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -- dst->data[2], dst->linesize[2], -- map->data[1], -- 128, stride2, -- 0, 0, w / 2, h / 2); -- } -- else -+ map->crop_top = 0; -+ map->crop_bottom = 0; -+ map->crop_left = 0; -+ map->crop_right = 0; -+ -+ if (av_rpi_sand_to_planar_frame(dst, map) != 0) - { - av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); - err = AVERROR(EINVAL); -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -index 1f543e9357..256c3d532f 100644 ---- a/libavutil/rpi_sand_fns.c -+++ b/libavutil/rpi_sand_fns.c -@@ -229,6 +229,75 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ - } - } - -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// single 
lose bottom 2 bits truncation -+// _x & _w in pixels, strides in bytes -+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 2) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 4; -+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+#if HAVE_SAND_ASM && 0 -+ if (_x == 0) { -+ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint8_t * d = dst; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3 = *p++; -+ -+ if (xskip0 == 1) -+ *d++ = (p3 >> 12) & 0xff; -+ *d++ = (p3 >> 22) & 0xff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3 = *p++; -+ *d++ = (p3 >> 2) & 0xff; -+ *d++ = (p3 >> 12) & 0xff; -+ *d++ = (p3 >> 22) & 0xff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3 = *p; -+ -+ *d++ = (p3 >> 2) & 0xff; -+ if (xrem1 == 2) -+ *d++ = (p3 >> 12) & 0xff; -+ } -+ } -+} -+ -+ - - // w/h in pixels - void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -@@ -310,6 +379,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) - av_rpi_sand_frame_stride1(src), 
av_rpi_sand_frame_stride2(src), - x/2, y/2, w/2, h/2); - break; -+ case AV_PIX_FMT_NV12: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w, h/2); -+ break; - default: - return -1; - } -@@ -344,6 +423,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) - av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), - x/2, y/2, w/2, h/2); - break; -+ case AV_PIX_FMT_NV12: -+ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w, h/2); -+ break; - default: - return -1; - } -diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h -index 634b55e800..462ccb8abd 100644 ---- a/libavutil/rpi_sand_fns.h -+++ b/libavutil/rpi_sand_fns.h -@@ -85,6 +85,11 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ - unsigned int _x, unsigned int y, - unsigned int _w, unsigned int h); - -+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); - - // w/h in pixels - void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, - -From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 1 Jun 2022 17:49:26 +0000 -Subject: [PATCH 053/136] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8 - -Also reworks the previous Armv8 SAND30->Y16 function 
in a slightly more -efficient way that makes it look more like the Armv7 version. ---- - libavutil/aarch64/rpi_sand_neon.S | 549 ++++++++++++++++++------------ - libavutil/aarch64/rpi_sand_neon.h | 4 + - libavutil/arm/rpi_sand_neon.S | 239 ++++++++++--- - libavutil/arm/rpi_sand_neon.h | 11 + - libavutil/rpi_sand_fns.c | 2 +- - 5 files changed, 541 insertions(+), 264 deletions(-) - -diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S -index cdcf71ee67..2f07d9674c 100644 ---- a/libavutil/aarch64/rpi_sand_neon.S -+++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -248,228 +248,6 @@ incomplete_block_loop_end_c8: - ret - endfunc - --//void ff_rpi_sand30_lines_to_planar_y16( --// uint8_t * dest, // [x0] --// unsigned int dst_stride, // [w1] -> assumed to be equal to _w --// const uint8_t * src, // [x2] --// unsigned int src_stride1, // [w3] -> 128 --// unsigned int src_stride2, // [w4] --// unsigned int _x, // [w5] --// unsigned int y, // [w6] --// unsigned int _w, // [w7] --// unsigned int h); // [sp, #0] -- --function ff_rpi_sand30_lines_to_planar_y16, export=1 -- stp x19, x20, [sp, #-48]! 
-- stp x21, x22, [sp, #16] -- stp x23, x24, [sp, #32] -- -- // w6 = argument h -- ldr w6, [sp, #48] -- -- // slice_inc = ((stride2 - 1) * stride1) -- mov w5, w4 -- sub w5, w5, #1 -- lsl w5, w5, #7 -- -- // total number of bytes per row = (width / 3) * 4 -- mov w8, w7 -- mov w9, #3 -- udiv w8, w8, w9 -- lsl w8, w8, #2 -- -- // number of full 128 byte blocks to be processed -- mov w9, #96 -- udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 -- -- // w10 = number of full integers to process (4 bytes) -- // w11 = remaning zero to two 10bit values still to copy over -- mov w12, #96 -- mul w12, w9, w12 -- sub w12, w7, w12 // width - blocks*96 = remaining points per row -- mov w11, #3 -- udiv w10, w12, w11 // full integers to process = w12 / 3 -- mul w11, w10, w11 // #integers *3 -- sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 -- -- // increase w9 by one if w10+w11 is not zero, and decrease the row count by one -- // this is to efficiently copy incomplete blocks at the end of the rows -- // the last row is handled explicitly to avoid writing out of bounds -- add w22, w10, w11 -- cmp w22, #0 -- cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise -- add w9, w9, w22 -- sub w6, w6, #1 -- -- // store the number of bytes in w20 which we copy too much for every row -- // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) -- mov w20, #96*2 -- mul w20, w20, w9 -- sub w20, w1, w20 -- -- mov w23, #0 // flag to check whether the last line had already been processed -- -- // bitmask to clear the uppper 6bits of the result values -- mov x19, #0x03ff03ff03ff03ff -- dup v22.2d, x19 -- -- // row counter = 0 -- eor w12, w12, w12 --row_loop_y16: -- cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows -- bge row_loop_y16_fin -- -- mov x13, x2 // row src -- eor w14, w14, w14 // full block counter --block_loop_y16: -- cmp w14, w9 -- bge block_loop_y16_fin -- -- // load 64 bytes -- ld1 { v0.4s, v1.4s, v2.4s, v3.4s 
}, [x13], #64 -- -- // process v0 and v1 -- xtn v16.4h, v0.4s -- ushr v0.4s, v0.4s, #10 -- xtn v17.4h, v0.4s -- ushr v0.4s, v0.4s, #10 -- xtn v18.4h, v0.4s -- -- xtn2 v16.8h, v1.4s -- and v16.16b, v16.16b, v22.16b -- ushr v1.4s, v1.4s, #10 -- xtn2 v17.8h, v1.4s -- and v17.16b, v17.16b, v22.16b -- ushr v1.4s, v1.4s, #10 -- xtn2 v18.8h, v1.4s -- and v18.16b, v18.16b, v22.16b -- -- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -- -- // process v2 and v3 -- xtn v23.4h, v2.4s -- ushr v2.4s, v2.4s, #10 -- xtn v24.4h, v2.4s -- ushr v2.4s, v2.4s, #10 -- xtn v25.4h, v2.4s -- -- xtn2 v23.8h, v3.4s -- and v23.16b, v23.16b, v22.16b -- ushr v3.4s, v3.4s, #10 -- xtn2 v24.8h, v3.4s -- and v24.16b, v24.16b, v22.16b -- ushr v3.4s, v3.4s, #10 -- xtn2 v25.8h, v3.4s -- and v25.16b, v25.16b, v22.16b -- -- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -- -- // load the second half of the block -> 64 bytes into registers v4-v7 -- ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 -- -- // process v4 and v5 -- xtn v16.4h, v4.4s -- ushr v4.4s, v4.4s, #10 -- xtn v17.4h, v4.4s -- ushr v4.4s, v4.4s, #10 -- xtn v18.4h, v4.4s -- -- xtn2 v16.8h, v5.4s -- and v16.16b, v16.16b, v22.16b -- ushr v5.4s, v5.4s, #10 -- xtn2 v17.8h, v5.4s -- and v17.16b, v17.16b, v22.16b -- ushr v5.4s, v5.4s, #10 -- xtn2 v18.8h, v5.4s -- and v18.16b, v18.16b, v22.16b -- -- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -- -- // v6 and v7 -- xtn v23.4h, v6.4s -- ushr v6.4s, v6.4s, #10 -- xtn v24.4h, v6.4s -- ushr v6.4s, v6.4s, #10 -- xtn v25.4h, v6.4s -- -- xtn2 v23.8h, v7.4s -- and v23.16b, v23.16b, v22.16b -- ushr v7.4s, v7.4s, #10 -- xtn2 v24.8h, v7.4s -- and v24.16b, v24.16b, v22.16b -- ushr v7.4s, v7.4s, #10 -- xtn2 v25.8h, v7.4s -- and v25.16b, v25.16b, v22.16b -- -- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -- -- add x13, x13, x5 // row src += slice_inc -- add w14, w14, #1 -- b block_loop_y16 --block_loop_y16_fin: -- -- -- -- -- add x2, x2, #128 // src += stride1 (start of the next row) -- add x0, x0, w20, sxtw // subtract the 
bytes we copied too much from dst -- add w12, w12, #1 -- b row_loop_y16 --row_loop_y16_fin: -- -- // check whether we have incomplete blocks at the end of every row -- // in that case decrease row block count by one -- // change height back to it's original value (meaning increase it by 1) -- // and jump back to another iteration of row_loop_y16 -- -- cmp w23, #1 -- beq row_loop_y16_fin2 // don't continue here if we already processed the last row -- add w6, w6, #1 // increase height to the original value -- sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count -- mov w23, #1 -- b row_loop_y16 --row_loop_y16_fin2: -- -- sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference -- -- // now we've got to handle the last block in the last row -- eor w12, w12, w12 // w12 = 0 = counter --integer_loop_y16: -- cmp w12, w10 -- bge integer_loop_y16_fin -- ldr w14, [x13], #4 -- and w15, w14, #0x3ff -- strh w15, [x0], #2 -- lsr w14, w14, #10 -- and w15, w14, #0x3ff -- strh w15, [x0], #2 -- lsr w14, w14, #10 -- and w15, w14, #0x3ff -- strh w15, [x0], #2 -- add w12, w12, #1 -- b integer_loop_y16 --integer_loop_y16_fin: -- --final_values_y16: -- // remaining point count = w11 -- ldr w14, [x13], #4 -- cmp w11, #0 -- beq final_values_y16_fin -- and w15, w14, #0x3ff -- strh w15, [x0], #2 -- cmp w11, #1 -- beq final_values_y16_fin -- lsr w14, w14, #10 -- and w15, w14, #0x3ff -- strh w15, [x0], #2 --final_values_y16_fin: -- -- ldp x23, x24, [sp, #32] -- ldp x21, x22, [sp, #16] -- ldp x19, x20, [sp], #48 -- ret --endfunc -- - //void ff_rpi_sand30_lines_to_planar_c16( - // uint8_t * dst_u, // [x0] - // unsigned int dst_stride_u, // [w1] == _w*2 -@@ -674,3 +452,330 @@ endfunc - // unsigned int _w, - // unsigned int h); - -+// void ff_rpi_sand30_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 
128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+// -+// Assumes that we are starting on a stripe boundary and that overreading -+// within the stripe is OK. However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ lsl w4, w4, #7 -+ sub w4, w4, #64 -+ sub w1, w1, w7, lsl #1 -+ uxtw x6, w6 -+ add x8, x2, x6, lsl #7 -+ ldr w6, [sp, #0] -+ -+10: -+ mov x2, x8 -+ mov w5, w7 -+1: -+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 -+ -+ subs w5, w5, #96 -+ -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #14 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #10 -+ -+ shrn2 v18.8h, v1.4s, #14 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #10 -+ -+ ushr v18.8h, v18.8h, #6 -+ bic v16.8h, #0xfc, lsl #8 -+ bic v17.8h, #0xfc, lsl #8 -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #14 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #10 -+ -+ shrn2 v21.8h, v3.4s, #14 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #10 -+ -+ ushr v21.8h, v21.8h, #6 -+ bic v19.8h, #0xfc, lsl #8 -+ bic v20.8h, #0xfc, lsl #8 -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #14 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #10 -+ -+ shrn2 v24.8h, v5.4s, #14 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #10 -+ -+ ushr v24.8h, v24.8h, #6 -+ bic v22.8h, #0xfc, lsl #8 -+ bic v23.8h, #0xfc, lsl #8 -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #14 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #10 -+ -+ shrn2 v27.8h, v7.4s, #14 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #10 -+ -+ ushr v27.8h, v27.8h, #6 -+ bic v25.8h, #0xfc, lsl #8 -+ bic v26.8h, #0xfc, lsl #8 -+ -+ blt 2f -+ -+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 -+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 -+ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 -+ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 -+ -+ bne 1b -+ -+11: -+ subs w6, w6, #1 -+ add x0, x0, w1, uxtw -+ add x8, x8, #128 -+ bne 10b -+ -+ ret -+ 
-+// Partial final write -+2: -+ cmp w5, #48-96 -+ blt 1f -+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 -+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 -+ beq 11b -+ mov v16.16b, v22.16b -+ mov v17.16b, v23.16b -+ sub w5, w5, #48 -+ mov v18.16b, v24.16b -+ mov v19.16b, v25.16b -+ mov v20.16b, v26.16b -+ mov v21.16b, v27.16b -+1: -+ cmp w5, #24-96 -+ blt 1f -+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 -+ beq 11b -+ mov v16.16b, v19.16b -+ mov v17.16b, v20.16b -+ sub w5, w5, #24 -+ mov v18.16b, v21.16b -+1: -+ cmp w5, #12-96 -+ blt 1f -+ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 -+ beq 11b -+ mov v16.2d[0], v16.2d[1] -+ sub w5, w5, #12 -+ mov v17.2d[0], v17.2d[1] -+ mov v18.2d[0], v18.2d[1] -+1: -+ cmp w5, #6-96 -+ blt 1f -+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 -+ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 -+ beq 11b -+ mov v16.2s[0], v16.2s[1] -+ sub w5, w5, #6 -+ mov v17.2s[0], v17.2s[1] -+ mov v18.2s[0], v18.2s[1] -+1: -+ cmp w5, #3-96 -+ blt 1f -+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 -+ beq 11b -+ mov v16.4h[0], v16.4h[1] -+ sub w5, w5, #3 -+ mov v17.4h[0], v17.4h[1] -+1: -+ cmp w5, #2-96 -+ blt 1f -+ st2 {v16.h, v17.h}[0], [x0], #4 -+ b 11b -+1: -+ st1 {v16.h}[0], [x0], #2 -+ b 11b -+ -+endfunc -+ -+// void ff_rpi_sand30_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+// -+// Assumes that we are starting on a stripe boundary and that overreading -+// within the stripe is OK. 
However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y8, export=1 -+ lsl w4, w4, #7 -+ sub w4, w4, #64 -+ sub w1, w1, w7 -+ uxtw x6, w6 -+ add x8, x2, x6, lsl #7 -+ ldr w6, [sp, #0] -+ -+10: -+ mov x2, x8 -+ mov w5, w7 -+1: -+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 -+ -+ subs w5, w5, #96 -+ -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #16 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #12 -+ -+ shrn2 v18.8h, v1.4s, #16 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #12 -+ -+ shrn v18.8b, v18.8h, #6 -+ shrn v16.8b, v16.8h, #2 -+ xtn v17.8b, v17.8h -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #16 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #12 -+ -+ shrn2 v21.8h, v3.4s, #16 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #12 -+ -+ shrn2 v18.16b, v21.8h, #6 -+ shrn2 v16.16b, v19.8h, #2 -+ xtn2 v17.16b, v20.8h -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #16 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #12 -+ -+ shrn2 v24.8h, v5.4s, #16 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #12 -+ -+ shrn v21.8b, v24.8h, #6 -+ shrn v19.8b, v22.8h, #2 -+ xtn v20.8b, v23.8h -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #16 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #12 -+ -+ shrn2 v27.8h, v7.4s, #16 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #12 -+ -+ shrn2 v21.16b, v27.8h, #6 -+ shrn2 v19.16b, v25.8h, #2 -+ xtn2 v20.16b, v26.8h -+ -+ blt 2f -+ -+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 -+ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 -+ -+ bne 1b -+ -+11: -+ subs w6, w6, #1 -+ add x0, x0, w1, uxtw -+ add x8, x8, #128 -+ bne 10b -+ -+ ret -+ -+// Partial final write -+2: -+ cmp w5, #48-96 -+ blt 1f -+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 -+ beq 11b -+ mov v16.16b, v22.16b -+ mov v17.16b, v23.16b -+ sub w5, w5, #48 -+ mov v18.16b, v24.16b -+1: -+ cmp w5, #24-96 -+ blt 1f -+ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 -+ beq 11b -+ mov v16.2d[0], v16.2d[1] -+ sub w5, w5, #24 -+ mov v17.2d[0], v17.2d[1] -+ mov 
v18.2d[0], v18.2d[1] -+1: -+ cmp w5, #12-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 -+ beq 11b -+ mov v16.2s[0], v16.2s[1] -+ sub w5, w5, #12 -+ mov v17.2s[0], v17.2s[1] -+ mov v18.2s[0], v18.2s[1] -+1: -+ cmp w5, #6-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 -+ beq 11b -+ mov v16.4h[0], v16.4h[1] -+ sub w5, w5, #6 -+ mov v17.4h[0], v17.4h[1] -+ mov v18.4h[0], v18.4h[1] -+1: -+ cmp w5, #3-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ beq 11b -+ mov v16.8b[0], v16.8b[1] -+ sub w5, w5, #3 -+ mov v17.8b[0], v17.8b[1] -+1: -+ cmp w5, #2-96 -+ blt 1f -+ st2 {v16.b, v17.b}[0], [x0], #2 -+ b 11b -+1: -+ st1 {v16.b}[0], [x0], #1 -+ b 11b -+ -+endfunc -+ -diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h -index b3aa481ea4..2a56135bc3 100644 ---- a/libavutil/aarch64/rpi_sand_neon.h -+++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -49,6 +49,10 @@ void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_ - uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, - unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); - -+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ - #ifdef __cplusplus - } - #endif -diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S -index 80890fe985..60e697f681 100644 ---- a/libavutil/arm/rpi_sand_neon.S -+++ b/libavutil/arm/rpi_sand_neon.S -@@ -360,7 +360,6 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 - ldr r6, [sp, #36] - ldr r7, [sp, #32] @ y - mov r12, #48 -- vmov.u16 q15, #0x3ff - sub r3, #1 - lsl r3, #7 - sub r1, r1, r6, lsl 
#1 -@@ -376,37 +375,33 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 - vldm r2!, {q10-q13} - add lr, #64 - -- vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! -+ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! - ands lr, #127 - vshrn.u32 d2, q10, #10 - vmovn.u32 d0, q10 -- vmovn.u32 d4, q14 - -- vshr.u32 q14, q11, #20 -+ vshrn.u32 d5, q11, #14 - it eq - addeq r2, r3 - vshrn.u32 d3, q11, #10 - vmovn.u32 d1, q11 -- vmovn.u32 d5, q14 - - subs r5, #48 -- vand q0, q15 -- vand q1, q15 -- vand q2, q15 -+ vshr.u16 q2, #6 -+ vbic.u16 q0, #0xfc00 -+ vbic.u16 q1, #0xfc00 - -- vshr.u32 q14, q12, #20 -+ vshrn.u32 d20, q12, #14 - vshrn.u32 d18, q12, #10 - vmovn.u32 d16, q12 -- vmovn.u32 d20, q14 - -- vshr.u32 q14, q13, #20 -+ vshrn.u32 d21, q13, #14 - vshrn.u32 d19, q13, #10 - vmovn.u32 d17, q13 -- vmovn.u32 d21, q14 - -- vand q8, q15 -- vand q9, q15 -- vand q10, q15 -+ vshr.u16 q10, #6 -+ vbic.u16 q8, #0xfc00 -+ vbic.u16 q9 , #0xfc00 - blt 2f - - vst3.16 {d0, d2, d4}, [r0], r12 -@@ -499,7 +494,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 - ldr r7, [sp, #48] - ldr r9, [sp, #52] - mov r12, #48 -- vmov.u16 q15, #0x3ff - sub r8, #1 - lsl r8, #7 - add r5, r5, r7, lsl #7 -@@ -515,48 +509,44 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 - add lr, #64 - - @ N.B. 
unpack [0,1,2] -> (reg order) 1, 0, 2 -- vshr.u32 q14, q0, #20 -- vshrn.u32 d16, q0, #10 -+ vshrn.u32 d20, q0, #14 - vmovn.u32 d18, q0 -+ vshrn.u32 d0, q0, #10 - ands lr, #127 -- vmovn.u32 d20, q14 - -- vshr.u32 q14, q1, #20 -- vshrn.u32 d17, q1, #10 -+ vshrn.u32 d21, q1, #14 - vmovn.u32 d19, q1 -- vmovn.u32 d21, q14 -+ vshrn.u32 d1, q1, #10 - -- vshr.u32 q14, q2, #20 - vshrn.u32 d22, q2, #10 -- vmovn.u32 d24, q2 -- vmovn.u32 d26, q14 -+ vmovn.u32 d2, q2 -+ vshrn.u32 d4, q2, #14 - -- vshr.u32 q14, q3, #20 -- vshrn.u32 d23, q3, #10 -- vmovn.u32 d25, q3 - add r10, r0, #24 -- vmovn.u32 d27, q14 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d3, q3 -+ vshrn.u32 d5, q3, #14 - - it eq - addeq r4, r8 -- vuzp.16 q8, q11 -- vuzp.16 q9, q12 -- vuzp.16 q10, q13 -+ vuzp.16 q0, q11 -+ vuzp.16 q9, q1 -+ vuzp.16 q10, q2 - -- @ q8 V0, V3,.. -> q0 -+ @ q0 V0, V3,.. - @ q9 U0, U3... - @ q10 U1, U4... - @ q11 U2, U5,.. -- @ q12 V1, V4,.. -> q1 -- @ q13 V2, V5,.. -> q2 -+ @ q1 V1, V4, -+ @ q2 V2, V5,.. - - subs r6, #24 -- vand q11, q15 -- vand q9, q15 -- vand q10, q15 -- vand q0, q8, q15 -- vand q1, q12, q15 -- vand q2, q13, q15 -+ vbic.u16 q11, #0xfc00 -+ vbic.u16 q9, #0xfc00 -+ vshr.u16 q10, #6 -+ vshr.u16 q2, #6 -+ vbic.u16 q0, #0xfc00 -+ vbic.u16 q1, #0xfc00 - - blt 2f - -@@ -765,4 +755,171 @@ function ff_rpi_sand30_lines_to_planar_p010, export=1 - endfunc - - -+@ void ff_rpi_sand30_lines_to_planar_y8( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y8, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ lsl r3, #7 -+ sub r1, r1, r6 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+1: -+ vldm r2, {q8-q15} -+ -+ subs r5, #96 -+ -+ vmovn.u32 d0, q8 -+ vshrn.u32 d2, q8, #12 -+ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! -+ -+ add r2, r3 -+ -+ vmovn.u32 d1, q9 -+ vshrn.u32 d3, q9, #12 -+ vshrn.u32 d5, q9, #16 -+ -+ pld [r2, #0] -+ -+ vshrn.u16 d0, q0, #2 -+ vmovn.u16 d1, q1 -+ vshrn.u16 d2, q2, #6 -+ -+ vmovn.u32 d16, q10 -+ vshrn.u32 d18, q10, #12 -+ vshrn.u32 d20, q10, #16 -+ -+ vmovn.u32 d17, q11 -+ vshrn.u32 d19, q11, #12 -+ vshrn.u32 d21, q11, #16 -+ -+ pld [r2, #64] -+ -+ vshrn.u16 d4, q8, #2 -+ vmovn.u16 d5, q9 -+ vshrn.u16 d6, q10, #6 -+ -+ vmovn.u32 d16, q12 -+ vshrn.u32 d18, q12, #12 -+ vshrn.u32 d20, q12, #16 -+ -+ vmovn.u32 d17, q13 -+ vshrn.u32 d19, q13, #12 -+ vshrn.u32 d21, q13, #16 -+ -+ vshrn.u16 d16, q8, #2 -+ vmovn.u16 d17, q9 -+ vshrn.u16 d18, q10, #6 -+ -+ vmovn.u32 d20, q14 -+ vshrn.u32 d22, q14, #12 -+ vshrn.u32 d24, q14, #16 -+ -+ vmovn.u32 d21, q15 -+ vshrn.u32 d23, q15, #12 -+ vshrn.u32 d25, q15, #16 -+ -+ vshrn.u16 d20, q10, #2 -+ vmovn.u16 d21, q11 -+ vshrn.u16 d22, q12, #6 -+ -+ blt 2f -+ -+ vst3.8 {d0, d1, d2}, [r0], r12 -+ vst3.8 {d4, d5, d6}, [r4], r12 -+ vst3.8 {d16, d17, d18}, [r0], r12 -+ vst3.8 {d20, d21, d22}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #48-96 -+ blt 1f -+ vst3.8 {d0, d1, d2}, [r0], r12 -+ vst3.8 {d4, d5, d6}, [r4], r12 -+ beq 11b -+ vmov q0, q8 -+ vmov q2, q10 -+ sub r5, #48 -+ vmov d2, d18 -+ vmov d6, d22 -+1: -+ cmp r5, #24-96 -+ blt 1f -+ vst3.8 {d0, d1, d2}, [r0]! 
-+ beq 11b -+ vmov q0, q2 -+ sub r5, #24 -+ vmov d2, d6 -+1: -+ cmp r5, #12-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! -+ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! -+ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! -+ beq 11b -+ vmov s0, s1 -+ sub r5, #12 -+ vmov s2, s3 -+ vmov s4, s5 -+1: -+ cmp r5, #6-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! -+ add r0, #12 -+ beq 11b -+ vshr.u32 d0, #16 -+ sub r5, #6 -+ vshr.u32 d1, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #3-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #8 -+ vshr.u32 d1, #8 -+1: -+ cmp r5, #2-96 -+ blt 1f -+ vst2.8 {d0[0], d1[0]}, [r0]! -+ b 11b -+1: -+ vst1.8 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ - -diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h -index 447f367bea..d457c10870 100644 ---- a/libavutil/arm/rpi_sand_neon.h -+++ b/libavutil/arm/rpi_sand_neon.h -@@ -95,5 +95,16 @@ void ff_rpi_sand30_lines_to_planar_p010( - unsigned int _w, // [sp, #12] -> r6 (cur r5) - unsigned int h); // [sp, #16] -> r7 - -+void ff_rpi_sand30_lines_to_planar_y8( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ - #endif // AVUTIL_ARM_SAND_NEON_H - -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -index 256c3d532f..b6071e2928 100644 ---- a/libavutil/rpi_sand_fns.c -+++ b/libavutil/rpi_sand_fns.c -@@ -247,7 +247,7 @@ void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, - const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; - const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // 
RHS of a stripe to LHS of next in words - --#if HAVE_SAND_ASM && 0 -+#if HAVE_SAND_ASM - if (_x == 0) { - ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); - return; - -From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 7 Jun 2022 14:46:12 +0000 -Subject: [PATCH 054/136] v4l2_m2m_enc: Add the ability to encode DRM_PRIME - frames - ---- - libavcodec/v4l2_buffers.c | 100 +++++++++++--- - libavcodec/v4l2_buffers.h | 20 ++- - libavcodec/v4l2_context.c | 212 +++++++++++++++++++++++++--- - libavcodec/v4l2_context.h | 15 +- - libavcodec/v4l2_m2m.c | 37 +++-- - libavcodec/v4l2_m2m.h | 3 + - libavcodec/v4l2_m2m_dec.c | 171 ++++++----------------- - libavcodec/v4l2_m2m_enc.c | 283 +++++++++++++++++++++++++++++++++++++- - 8 files changed, 643 insertions(+), 198 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 8c4f18dbed..9ef2f40e39 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -29,6 +29,8 @@ - #include - #include - #include "libavcodec/avcodec.h" -+#include "libavcodec/internal.h" -+#include "libavutil/avassert.h" - #include "libavutil/pixdesc.h" - #include "libavutil/hwcontext.h" - #include "v4l2_context.h" -@@ -60,27 +62,39 @@ static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) - return tb.num && tb.den ? tb : v4l2_timebase; - } - -+static inline struct timeval tv_from_int(const int64_t t) -+{ -+ return (struct timeval){ -+ .tv_usec = t % USEC_PER_SEC, -+ .tv_sec = t / USEC_PER_SEC -+ }; -+} -+ -+static inline int64_t int_from_tv(const struct timeval t) -+{ -+ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; -+} -+ - static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) - { - /* convert pts to v4l2 timebase */ - const int64_t v4l2_pts = -- out->context->no_pts_rescale ? pts : - pts == AV_NOPTS_VALUE ? 
0 : - av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); -- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; -- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; -+ out->buf.timestamp = tv_from_int(v4l2_pts); - } - - static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) - { -+ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); -+ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; -+#if 0 - /* convert pts back to encoder timebase */ -- const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -- avbuf->buf.timestamp.tv_usec; -- - return - avbuf->context->no_pts_rescale ? v4l2_pts : - v4l2_pts == 0 ? AV_NOPTS_VALUE : - av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); -+#endif - } - - static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) -@@ -435,7 +449,7 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) - - ff_mutex_lock(&ctx->lock); - -- avbuf->status = V4L2BUF_AVAILABLE; -+ ff_v4l2_buffer_set_avail(avbuf); - - if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { - av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); -@@ -599,6 +613,38 @@ static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) - return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); - } - -+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) -+{ -+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; -+ -+ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) -+ return AVERROR(EINVAL); -+ -+ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -+ // Only currently cope with single buffer types -+ if (out->buf.length != 1) -+ return AVERROR_PATCHWELCOME; -+ if (src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ out->planes[0].m.fd = src->objects[0].fd; -+ } -+ else { -+ if (src->nb_objects != 1) -+ 
return AVERROR(EINVAL); -+ -+ out->buf.m.fd = src->objects[0].fd; -+ } -+ -+ // No need to copy src AVDescriptor and if we did then we may confuse -+ // fd close on free -+ out->ref_buf = av_buffer_ref(frame->buf[0]); -+ -+ return 0; -+} -+ - static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { - int i; -@@ -678,7 +724,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - * - ******************************************************************************/ - --int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) -+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) - { - out->buf.flags = frame->key_frame ? - (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -@@ -688,10 +734,15 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); - v4l2_set_color_range(out, frame->color_range); - // PTS & interlace are buffer vars -- v4l2_set_pts(out, frame->pts); -+ if (track_ts) -+ out->buf.timestamp = tv_from_int(track_ts); -+ else -+ v4l2_set_pts(out, frame->pts); - v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - -- return v4l2_buffer_swframe_to_buf(frame, out); -+ return frame->format == AV_PIX_FMT_DRM_PRIME ? -+ v4l2_buffer_primeframe_to_buf(frame, out) : -+ v4l2_buffer_swframe_to_buf(frame, out); - } - - int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) -@@ -754,6 +805,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - - pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; -+ pkt->flags = 0; - - if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) - pkt->flags |= AV_PKT_FLAG_KEY; -@@ -768,8 +820,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - return 0; - } - --int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -- const void *extdata, size_t extlen) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, -+ const void *extdata, size_t extlen, -+ const int64_t timestamp) - { - int ret; - -@@ -783,7 +836,10 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - if (ret && ret != AVERROR(ENOMEM)) - return ret; - -- v4l2_set_pts(out, pkt->pts); -+ if (timestamp) -+ out->buf.timestamp = tv_from_int(timestamp); -+ else -+ v4l2_set_pts(out, pkt->pts); - - out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? - (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -@@ -794,7 +850,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) - { -- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); -+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); - } - - -@@ -814,13 +870,15 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) - close(avbuf->drm_frame.objects[i].fd); - } - -+ av_buffer_unref(&avbuf->ref_buf); -+ - ff_weak_link_unref(&avbuf->context_wl); - - av_free(avbuf); - } - - --int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) - { - int ret, i; - V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); -@@ -837,7 +895,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - } - - avbuf->context = ctx; -- 
avbuf->buf.memory = V4L2_MEMORY_MMAP; -+ avbuf->buf.memory = mem; - avbuf->buf.type = ctx->type; - avbuf->buf.index = index; - -@@ -867,6 +925,8 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - avbuf->num_planes = 1; - - for (i = 0; i < avbuf->num_planes; i++) { -+ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && -+ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); - - avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? - ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : -@@ -875,21 +935,17 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - -- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -- !buf_to_m2mctx(avbuf)->output_drm) { -+ if (want_mmap) - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -- } - } else { - avbuf->plane_info[i].length = avbuf->buf.length; - -- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -- !buf_to_m2mctx(avbuf)->output_drm) { -+ if (want_mmap) - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -- } - } - - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 3b7ca4d99e..1ac32c5989 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -59,6 +59,10 @@ typedef struct V4L2Buffer { - - /* DRM descriptor */ - AVDRMFrameDescriptor drm_frame; -+ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we -+ * are done -+ */ -+ AVBufferRef * ref_buf; - - /* keep track of the mmap address and mmap length */ - struct 
V4L2Plane_info { -@@ -110,8 +114,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); - */ - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - --int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -- const void *extdata, size_t extlen); -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, -+ const void *extdata, size_t extlen, -+ const int64_t timestamp); - - /** - * Extracts the data from an AVFrame to a V4L2Buffer -@@ -121,7 +126,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - * - * @returns 0 in case of success, a negative AVERROR code otherwise - */ --int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); -+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); - - /** - * Initializes a V4L2Buffer -@@ -131,7 +136,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); - * - * @returns 0 in case of success, a negative AVERROR code otherwise - */ --int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); - - /** - * Enqueues a V4L2Buffer -@@ -142,5 +147,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context - */ - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); - -+static inline void -+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) -+{ -+ avbuf->status = V4L2BUF_AVAILABLE; -+ av_buffer_unref(&avbuf->ref_buf); -+} -+ - - #endif // AVCODEC_V4L2_BUFFERS_H -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index b3662aedaa..7a707d21fc 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -43,6 +43,160 @@ struct v4l2_format_update { - int update_avfmt; - }; - -+ -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -+{ 
-+ return (int64_t)n; -+} -+ -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -+{ -+ return (unsigned int)pts; -+} -+ -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. -+static int64_t -+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) -+{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++x->track_no == 0) -+ x->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, x->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -+ x->last_pkt_dts = avpkt->dts; -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pending = 1, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .dts = avpkt->dts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ return track_pts; -+} -+ -+static int64_t -+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) -+{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++x->track_no == 0) -+ x->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, x->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); -+ x->last_pkt_dts = frame->pkt_dts; -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pending = 1, -+ .pkt_size = 0, -+ .pts = 
frame->pts, -+ .dts = AV_NOPTS_VALUE, -+ .reordered_opaque = frame->reordered_opaque, -+ .pkt_pos = frame->pkt_pos, -+ .pkt_duration = frame->pkt_duration, -+ .track_pts = track_pts -+ }; -+ return track_pts; -+} -+ -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_frame_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, -+ AVFrame *const frame) -+{ -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ V4L2m2mTrackEl *const t = x->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, -+ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = x->last_pkt_dts; -+ frame->reordered_opaque = x->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; -+ frame->pkt_dts = x->last_pkt_dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (frame->pts != AV_NOPTS_VALUE) -+ x->last_pts = frame->pts; -+ t->pending = 0; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } -+ -+ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); -+ return 0; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_pkt_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, -+ AVPacket *const pkt) -+{ -+ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ V4L2m2mTrackEl *const t = x->track_els + n; -+ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) -+ { -+ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, -+ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); -+ pkt->pts = AV_NOPTS_VALUE; -+ } -+ else if (!t->discard) -+ { -+ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; -+ -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (pkt->pts != AV_NOPTS_VALUE) -+ x->last_pts = pkt->pts; -+ t->pending = 0; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); -+ return -1; -+ } -+ -+ // * Would like something much better than this...xlat(offset + out_count)? 
-+ pkt->dts = pkt->pts; -+ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ pkt->pts, t->track_pts, n); -+ return 0; -+} -+ -+ - static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) - { - return V4L2_TYPE_IS_OUTPUT(ctx->type) ? -@@ -353,12 +507,14 @@ dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) - atomic_fetch_sub(&ctx->q_count, 1); - - avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -- avbuf->status = V4L2BUF_AVAILABLE; -+ ff_v4l2_buffer_set_avail(avbuf); - avbuf->buf = buf; - if (is_mp) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; - } -+ // Done with any attached buffer -+ av_buffer_unref(&avbuf->ref_buf); - - if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { - // Zero length cap buffer return == EOS -@@ -733,7 +889,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) - for (i = 0; i < ctx->num_buffers; ++i) { - struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; - if (buf->status == V4L2BUF_IN_DRIVER) -- buf->status = V4L2BUF_AVAILABLE; -+ ff_v4l2_buffer_set_avail(buf); - } - atomic_store(&ctx->q_count, 0); - } -@@ -787,6 +943,8 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - { - if (cmd == VIDIOC_STREAMOFF) - flush_all_buffers_status(ctx); -+ else -+ ctx->first_buf = 1; - - ctx->streamon = (cmd == VIDIOC_STREAMON); - av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, -@@ -803,14 +961,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - { -- V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; -+ int64_t track_ts; - V4L2Buffer* avbuf; - int ret; - - if (!frame) { - ret = v4l2_stop_encode(ctx); - if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); -+ av_log(avctx, AV_LOG_ERROR, "%s 
stop_encode\n", ctx->name); - s->draining= 1; - return 0; - } -@@ -819,7 +979,9 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); -+ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); -+ -+ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); - if (ret) - return ret; - -@@ -830,14 +992,16 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - const void * extdata, size_t extlen) - { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer* avbuf; - int ret; -+ int64_t track_ts; - - if (!pkt->size) { - ret = v4l2_stop_decode(ctx); - // Log but otherwise ignore stop failure - if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); -+ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); - s->draining = 1; - return 0; - } -@@ -846,7 +1010,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); -+ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); -+ -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); - if (ret == AVERROR(ENOMEM)) - av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", - __func__, pkt->size, avbuf->planes[0].length); -@@ -858,24 +1024,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - { -+ V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer *avbuf; - int rv; - -- if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -- return rv; -+ do { -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -+ return rv; -+ if ((rv = 
ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) -+ return rv; -+ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ return 0; - } - - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) - { -+ V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer *avbuf; - int rv; - -- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -- return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC -+ do { -+ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC -+ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) -+ return rv; -+ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - -- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); -+ return 0; - } - - int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) -@@ -951,7 +1129,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) - } - - --static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) - { - V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - struct v4l2_requestbuffers req; -@@ -962,7 +1140,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers - - memset(&req, 0, sizeof(req)); - req.count = req_buffers; -- req.memory = V4L2_MEMORY_MMAP; -+ req.memory = mem; - req.type = ctx->type; - while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { - if (errno != EINTR) { -@@ -986,7 +1164,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers - } - - for (i = 0; i < ctx->num_buffers; i++) { -- ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); - if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] 
initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto fail_release; -@@ -1052,7 +1230,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) - goto fail_unref_hwframes; - } - -- ret = create_buffers(ctx, ctx->num_buffers); -+ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); - if (ret < 0) - goto fail_unref_hwframes; - -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 0efff58f18..21265f1bd7 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -91,11 +91,19 @@ typedef struct V4L2Context { - */ - int num_buffers; - -+ /** -+ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF -+ */ -+ enum v4l2_memory buf_mem; -+ - /** - * Whether the stream has been started (VIDIOC_STREAMON has been sent). - */ - int streamon; - -+ /* 1st buffer after stream on */ -+ int first_buf; -+ - /** - * Either no more buffers available or an unrecoverable error was notified - * by the V4L2 kernel driver: once set the context has to be exited. -@@ -105,11 +113,10 @@ typedef struct V4L2Context { - int flag_last; - - /** -- * PTS rescale not wanted -- * If the PTS is just a dummy frame count then rescale is -- * actively harmful -+ * If NZ then when Qing frame/pkt use this rather than the -+ * "real" PTS - */ -- int no_pts_rescale; -+ uint64_t track_ts; - - AVBufferRef *frames_ref; - atomic_int q_count; -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 6dd01e2e00..1e30d15fd8 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -35,6 +35,14 @@ - #include "v4l2_fmt.h" - #include "v4l2_m2m.h" - -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ x->last_pts = AV_NOPTS_VALUE; -+} -+ -+ - static inline int v4l2_splane_video(struct v4l2_capability *cap) - { - if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && -@@ -67,7 +75,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - - s->capture.done = s->output.done = 0; - 
s->capture.name = "capture"; -+ s->capture.buf_mem = V4L2_MEMORY_MMAP; - s->output.name = "output"; -+ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; - atomic_init(&s->refcount, 0); - sem_init(&s->refsync, 0, 0); - -@@ -334,35 +344,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) - return v4l2_configure_contexts(s); - } - --int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) -+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) - { -- *s = av_mallocz(sizeof(V4L2m2mContext)); -- if (!*s) -+ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); -+ -+ *pps = NULL; -+ if (!s) - return AVERROR(ENOMEM); - -- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), -+ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), - &v4l2_m2m_destroy_context, NULL, 0); - if (!priv->context_ref) { -- av_freep(s); -+ av_free(s); - return AVERROR(ENOMEM); - } - - /* assign the context */ -- priv->context = *s; -- (*s)->priv = priv; -+ priv->context = s; -+ s->priv = priv; - - /* populate it */ -- priv->context->capture.num_buffers = priv->num_capture_buffers; -- priv->context->output.num_buffers = priv->num_output_buffers; -- priv->context->self_ref = priv->context_ref; -- priv->context->fd = -1; -+ s->capture.num_buffers = priv->num_capture_buffers; -+ s->output.num_buffers = priv->num_output_buffers; -+ s->self_ref = priv->context_ref; -+ s->fd = -1; -+ xlat_init(&s->xlat); - - priv->context->frame = av_frame_alloc(); - if (!priv->context->frame) { - av_buffer_unref(&priv->context_ref); -- *s = NULL; /* freed when unreferencing context_ref */ - return AVERROR(ENOMEM); - } - -+ *pps = s; - return 0; - } -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 19d618698d..d6cdaf65e1 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -103,6 +103,9 @@ typedef struct V4L2m2mContext { - /* generate DRM frames */ - int output_drm; - -+ /* input frames are 
drmprime */ -+ int input_drm; -+ - /* Frame tracking */ - xlat_track_t xlat; - int pending_hw; -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 7e17044706..fbbfc81342 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -169,96 +169,17 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) - return 0; - } - --static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) --{ -- return (int64_t)n; --} -- --static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) --{ -- return (unsigned int)pts; --} -- --// FFmpeg requires us to propagate a number of vars from the coded pkt into --// the decoded frame. The only thing that tracks like that in V4L2 stateful --// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no --// guarantees about PTS being unique or specified for every frame so replace --// the supplied PTS with a simple incrementing number and keep a circular --// buffer of all the things we want preserved (including the original PTS) --// indexed by the tracking no. 
- static void --xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) --{ -- int64_t track_pts; -- -- // Avoid 0 -- if (++x->track_no == 0) -- x->track_no = 1; -- -- track_pts = track_to_pts(avctx, x->track_no); -- -- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -- x->last_pkt_dts = avpkt->dts; -- x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -- .discard = 0, -- .pending = 1, -- .pkt_size = avpkt->size, -- .pts = avpkt->pts, -- .dts = avpkt->dts, -- .reordered_opaque = avctx->reordered_opaque, -- .pkt_pos = avpkt->pos, -- .pkt_duration = avpkt->duration, -- .track_pts = track_pts -- }; -- avpkt->pts = track_pts; --} -- --// Returns -1 if we should discard the frame --static int --xlat_pts_out(AVCodecContext *const avctx, -- xlat_track_t * const x, -+set_best_effort_pts(AVCodecContext *const avctx, - pts_stats_t * const ps, - AVFrame *const frame) - { -- unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -- V4L2m2mTrackEl *const t = x->track_els + n; -- if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -- { -- av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -- frame->pts = AV_NOPTS_VALUE; -- frame->pkt_dts = x->last_pkt_dts; -- frame->reordered_opaque = x->last_opaque; -- frame->pkt_pos = -1; -- frame->pkt_duration = 0; -- frame->pkt_size = -1; -- } -- else if (!t->discard) -- { -- frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; -- frame->pkt_dts = x->last_pkt_dts; -- frame->reordered_opaque = t->reordered_opaque; -- frame->pkt_pos = t->pkt_pos; -- frame->pkt_duration = t->pkt_duration; -- frame->pkt_size = t->pkt_size; -- -- x->last_opaque = x->track_els[n].reordered_opaque; -- if (frame->pts != AV_NOPTS_VALUE) -- x->last_pts = frame->pts; -- t->pending = 0; -- } -- else -- { -- av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -- return -1; -- } -- - pts_stats_add(ps, frame->pts); - - frame->best_effort_timestamp = pts_stats_guess(ps); - frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", -- frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); -- return 0; -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); - } - - static void -@@ -272,13 +193,6 @@ xlat_flush(xlat_track_t * const x) - x->last_pts = AV_NOPTS_VALUE; - } - --static void --xlat_init(xlat_track_t * const x) --{ -- memset(x, 0, sizeof(*x)); -- x->last_pts = AV_NOPTS_VALUE; --} -- - static int - xlat_pending(const xlat_track_t * const x) - { -@@ -419,8 +333,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); - return ret; - } -- -- xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); - } - - if (s->draining) { -@@ -542,49 +454,47 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - prefer_dq ? 5 : - src_rv == NQ_Q_FULL ? 
-1 : 0; - -- do { -- // Dequeue frame will unref any previous contents of frame -- // if it returns success so we don't need an explicit unref -- // when discarding -- // This returns AVERROR(EAGAIN) on timeout or if -- // there is room in the input Q and timeout == -1 -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -- -- // Failure due to no buffer in Q? -- if (dst_rv == AVERROR(ENOSPC)) { -- // Wait & retry -- if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { -- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -- } -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) on timeout or if -+ // there is room in the input Q and timeout == -1 -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -+ -+ // Failure due to no buffer in Q? -+ if (dst_rv == AVERROR(ENOSPC)) { -+ // Wait & retry -+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - } -+ } -+ -+ // Adjust dynamic pending threshold -+ if (dst_rv == 0) { -+ if (--s->pending_hw < PENDING_HW_MIN) -+ s->pending_hw = PENDING_HW_MIN; -+ s->pending_n = 0; - -- // Adjust dynamic pending threshold -- if (dst_rv == 0) { -- if (--s->pending_hw < PENDING_HW_MIN) -- s->pending_hw = PENDING_HW_MIN; -+ set_best_effort_pts(avctx, &s->pts_stat, frame); -+ } -+ else if (dst_rv == AVERROR(EAGAIN)) { -+ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { -+ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; - s->pending_n = 0; - } -- else if (dst_rv == AVERROR(EAGAIN)) { -- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { -- s->pending_hw = pending * 16 + PENDING_HW_OFFSET; -- s->pending_n = 0; -- } -- } -+ } - -- if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { -- av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -- dst_rv = AVERROR_EOF; -- 
s->capture.done = 1; -- } -- else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -- av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -- s->draining, s->capture.done); -- else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -- av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -- s->draining, s->capture.done, dst_rv); -- -- // Go again if we got a frame that we need to discard -- } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); -+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { -+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -+ dst_rv = AVERROR_EOF; -+ s->capture.done = 1; -+ } -+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); - } - - ++i; -@@ -791,7 +701,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if (ret < 0) - return ret; - -- xlat_init(&s->xlat); - pts_stats_init(&s->pts_stat, avctx, "decoder"); - s->pending_hw = PENDING_HW_MIN; - -@@ -810,12 +719,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - output->av_codec_id = avctx->codec_id; - output->av_pix_fmt = AV_PIX_FMT_NONE; - output->min_buf_size = max_coded_size(avctx); -- output->no_pts_rescale = 1; - - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; - capture->min_buf_size = 0; -- capture->no_pts_rescale = 1; - - /* the client requests the codec to generate DRM frames: - * - data[0] will therefore point to the returned AVDRMFrameDescriptor -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index 9a0837ecf3..05ff6ba726 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ 
b/libavcodec/v4l2_m2m_enc.c -@@ -24,6 +24,8 @@ - #include - #include - #include -+#include -+ - #include "encode.h" - #include "libavcodec/avcodec.h" - #include "libavutil/pixdesc.h" -@@ -38,6 +40,34 @@ - #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x - #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x - -+// P030 should be defined in drm_fourcc.h and hopefully will be sometime -+// in the future but until then... -+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') -+#endif -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+#ifndef V4L2_CID_CODEC_BASE -+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE -+#endif -+ -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in videodev2.h hopefully will be sometime in the future but until then... -+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ - static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) - { - struct v4l2_streamparm parm = { 0 }; -@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) - static int v4l2_check_b_frame_support(V4L2m2mContext *s) - { - if (s->avctx->max_b_frames) -- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); -+ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); - -- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); -+ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); - v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); - if 
(s->avctx->max_b_frames == 0) - return 0; - - avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); -- - return AVERROR_PATCHWELCOME; - } - -@@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) - return 0; - } - -+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) -+{ -+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; -+ -+ const uint32_t drm_fmt = src->layers[0].format; -+ // Treat INVALID as LINEAR -+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? -+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; -+ uint32_t pix_fmt = 0; -+ uint32_t w = 0; -+ uint32_t h = 0; -+ uint32_t bpl = src->layers[0].planes[0].pitch; -+ -+ // We really don't expect multiple layers -+ // All formats that we currently cope with are single object -+ -+ if (src->nb_layers != 1 || src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ switch (drm_fmt) { -+ case DRM_FORMAT_YUV420: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 3) -+ break; -+ pix_fmt = V4L2_PIX_FMT_YUV420; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ break; -+ -+ case DRM_FORMAT_NV12: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_COL128; -+ w = bpl; -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ case DRM_FORMAT_P030: -+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; -+ w = bpl / 2; // Matching lie to how we construct this -+ h = 
src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ default: -+ break; -+ } -+ -+ if (!pix_fmt) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->plane_fmt[0].bytesperline = bpl; -+ pix->num_planes = 1; -+ } -+ else { -+ struct v4l2_pix_format *const pix = &format->fmt.pix; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->bytesperline = bpl; -+ } -+ -+ return 0; -+} -+ -+// Do we have similar enough formats to be usable? -+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) -+{ -+ if (a->type != b->type) -+ return 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { -+ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; -+ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; -+ unsigned int i; -+ if (pa->pixelformat != pb->pixelformat || -+ pa->num_planes != pb->num_planes) -+ return 0; -+ for (i = 0; i != pa->num_planes; ++i) { -+ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) -+ return 0; -+ } -+ } -+ else { -+ const struct v4l2_pix_format *const pa = &a->fmt.pix; -+ const struct v4l2_pix_format *const pb = &b->fmt.pix; -+ if (pa->pixelformat != pb->pixelformat || -+ pa->bytesperline != pb->bytesperline) -+ return 0; -+ } -+ return 1; -+} -+ -+ - static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const output = &s->output; - -+ // Signal EOF if needed -+ if (!frame) { -+ return ff_v4l2_context_enqueue_frame(output, frame); -+ } -+ -+ if (s->input_drm && !output->streamon) { -+ int rv; -+ struct v4l2_format req_format = {.type = output->format.type}; -+ -+ // Set format when we first get a buffer -+ if ((rv = 
avdrm_to_v4l2(&req_format, frame)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); -+ return rv; -+ } -+ -+ ff_v4l2_context_release(output); -+ -+ output->format = req_format; -+ -+ if ((rv = ff_v4l2_context_set_format(output)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); -+ return rv; -+ } -+ -+ if (!fmt_eq(&req_format, &output->format)) { -+ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ output->selection.top = frame->crop_top; -+ output->selection.left = frame->crop_left; -+ output->selection.width = av_frame_cropped_width(frame); -+ output->selection.height = av_frame_cropped_height(frame); -+ -+ if ((rv = ff_v4l2_context_init(output)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); -+ return rv; -+ } -+ -+ { -+ struct v4l2_selection selection = { -+ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, -+ .target = V4L2_SEL_TGT_CROP, -+ .r = output->selection -+ }; -+ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", -+ selection.r.width, selection.r.height, selection.r.left, selection.r.top, -+ av_err2str(AVERROR(errno))); -+ } -+ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", -+ selection.r.width, selection.r.height, selection.r.left, selection.r.top); -+ } -+ } -+ - #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME -- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) -+ if (frame->pict_type == AV_PICTURE_TYPE_I) - v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); - #endif - -@@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - } - - dequeue: -- return ff_v4l2_context_dequeue_packet(capture, avpkt); -+ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) -+ return ret; -+ -+ if (capture->first_buf == 1) { -+ uint8_t * data; -+ const int len = 
avpkt->size; -+ -+ // 1st buffer after streamon should be SPS/PPS -+ capture->first_buf = 2; -+ -+ // Clear both possible stores so there is no chance of confusion -+ av_freep(&s->extdata_data); -+ s->extdata_size = 0; -+ av_freep(&avctx->extradata); -+ avctx->extradata_size = 0; -+ -+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) -+ memcpy(data, avpkt->data, len); -+ -+ av_packet_unref(avpkt); -+ -+ if (data == NULL) -+ return AVERROR(ENOMEM); -+ -+ // We need to copy the header, but keep local if not global -+ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { -+ avctx->extradata = data; -+ avctx->extradata_size = len; -+ } -+ else { -+ s->extdata_data = data; -+ s->extdata_size = len; -+ } -+ -+ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) -+ return ret; -+ } -+ -+ // First frame must be key so mark as such even if encoder forgot -+ if (capture->first_buf == 2) -+ avpkt->flags |= AV_PKT_FLAG_KEY; -+ -+ // Add SPS/PPS to the start of every key frame if non-global headers -+ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { -+ const size_t newlen = s->extdata_size + avpkt->size; -+ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); -+ -+ if (buf == NULL) { -+ av_packet_unref(avpkt); -+ return AVERROR(ENOMEM); -+ } -+ -+ memcpy(buf->data, s->extdata_data, s->extdata_size); -+ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); -+ -+ av_buffer_unref(&avpkt->buf); -+ avpkt->buf = buf; -+ avpkt->data = buf->data; -+ avpkt->size = newlen; -+ } -+ -+// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); -+ capture->first_buf = 0; -+ return 0; - } - - static av_cold int v4l2_encode_init(AVCodecContext *avctx) -@@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - uint32_t v4l2_fmt_output; - int ret; - -+ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, 
avctx->pix_fmt, avctx->sw_pix_fmt); -+ - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; -@@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - capture = &s->capture; - output = &s->output; - -+ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); -+ - /* common settings output/capture */ - output->height = capture->height = avctx->height; - output->width = capture->width = avctx->width; - - /* output context */ - output->av_codec_id = AV_CODEC_ID_RAWVIDEO; -- output->av_pix_fmt = avctx->pix_fmt; -+ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : -+ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt : -+ AV_PIX_FMT_YUV420P; - - /* capture context */ - capture->av_codec_id = avctx->codec_id; -@@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - v4l2_fmt_output = output->format.fmt.pix.pixelformat; - - pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); -- if (pix_fmt_output != avctx->pix_fmt) { -+ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); - av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); - return AVERROR(EINVAL); - -From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 8 Jun 2022 16:13:31 +0000 -Subject: [PATCH 055/136] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is - always NO_PTS - -If we do have DTS but don't have PTS then assume PTS=DTS. -Also get rid of last_dts from tracking as its info wasn't actually -useful in any way. 
---- - libavcodec/v4l2_context.c | 6 ++---- - libavcodec/v4l2_m2m.h | 1 - - libavcodec/v4l2_m2m_dec.c | 8 +++++++- - 3 files changed, 9 insertions(+), 6 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 7a707d21fc..6b97eab41e 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -73,7 +73,6 @@ xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPack - track_pts = track_to_pts(avctx, x->track_no); - - av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -- x->last_pkt_dts = avpkt->dts; - x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ - .discard = 0, - .pending = 1, -@@ -100,7 +99,6 @@ xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFr - track_pts = track_to_pts(avctx, x->track_no); - - av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); -- x->last_pkt_dts = frame->pkt_dts; - x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ - .discard = 0, - .pending = 1, -@@ -129,7 +127,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, - av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, - "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); - frame->pts = AV_NOPTS_VALUE; -- frame->pkt_dts = x->last_pkt_dts; -+ frame->pkt_dts = AV_NOPTS_VALUE; - frame->reordered_opaque = x->last_opaque; - frame->pkt_pos = -1; - frame->pkt_duration = 0; -@@ -138,7 +136,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, - else if (!t->discard) - { - frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; -- frame->pkt_dts = x->last_pkt_dts; -+ frame->pkt_dts = t->dts; - frame->reordered_opaque = t->reordered_opaque; - frame->pkt_pos = t->pkt_pos; - frame->pkt_duration = t->pkt_duration; -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index d6cdaf65e1..ee72beb052 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -67,7 +67,6 @@ typedef struct pts_stats_s - typedef struct xlat_track_s { - unsigned int track_no; - int64_t last_pts; -- int64_t last_pkt_dts; - int64_t last_opaque; - V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; - } xlat_track_t; -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index fbbfc81342..485a96f4b4 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -177,7 +177,13 @@ set_best_effort_pts(AVCodecContext *const avctx, - pts_stats_add(ps, frame->pts); - - frame->best_effort_timestamp = pts_stats_guess(ps); -- frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ // If we can't guess from just PTS - try DTS -+ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) -+ frame->best_effort_timestamp = frame->pkt_dts; -+ -+ // We can't emulate what s/w does in a useful manner and using the -+ // "correct" answer seems to just confuse things. -+ frame->pkt_dts = frame->pts; - av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", - frame->pts, frame->best_effort_timestamp, frame->pkt_dts); - } - -From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 30 Jun 2022 15:59:23 +0000 -Subject: [PATCH 056/136] v4l2: Update H265 request for current API - -This works with v9 of the H265 patch set which hopefully will be the -last one. Hevc controls extracted from patched v4l2-controls into -hevc-ctrls-v4 - if HEVC controls found in the system v4l2-controls then -those will be used instead. 
---- - libavcodec/Makefile | 2 +- - libavcodec/hevc-ctrls-v4.h | 515 +++++++++++++++++++++++++++++++++ - libavcodec/v4l2_req_hevc_v4.c | 3 + - libavcodec/v4l2_req_hevc_vx.c | 81 ++++-- - libavcodec/v4l2_request_hevc.c | 6 +- - libavcodec/v4l2_request_hevc.h | 1 + - 6 files changed, 583 insertions(+), 25 deletions(-) - create mode 100644 libavcodec/hevc-ctrls-v4.h - create mode 100644 libavcodec/v4l2_req_hevc_v4.c - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 2b3c16185d..d433a71236 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o - OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h -new file mode 100644 -index 0000000000..7e05f6e7c3 ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v4.h -@@ -0,0 +1,515 @@ -+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ -+/* -+ * Video for Linux Two controls header file -+ * -+ * Copyright (C) 1999-2012 the contributors -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * Alternatively you can redistribute this file under the terms of the -+ * BSD license as stated below: -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in -+ * the documentation and/or other materials provided with the -+ * distribution. -+ * 3. The names of its contributors may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * The contents of this header was split off from videodev2.h. 
All control -+ * definitions should be added to this header, which is included by -+ * videodev2.h. -+ */ -+ -+#ifndef AVCODEC_HEVC_CTRLS_V4_H -+#define AVCODEC_HEVC_CTRLS_V4_H -+ -+#include -+#include -+ -+#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) -+#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) -+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) -+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) -+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) -+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) -+#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) -+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) -+ -+enum v4l2_stateless_hevc_decode_mode { -+ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_stateless_hevc_start_code { -+ V4L2_STATELESS_HEVC_START_CODE_NONE, -+ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/** -+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set -+ * -+ * @video_parameter_set_id: specifies the value of the -+ * vps_video_parameter_set_id of the active VPS -+ * @seq_parameter_set_id: provides an identifier for the SPS for -+ * reference by other syntax elements -+ * @pic_width_in_luma_samples: specifies the width of each decoded picture -+ * in units of luma samples -+ * @pic_height_in_luma_samples: specifies the height of each decoded picture -+ * in units of luma samples -+ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the -+ * samples of the luma array -+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the -+ * samples of the chroma arrays -+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of -+ * the variable MaxPicOrderCntLsb -+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum -+ * required size of the decoded picture -+ * buffer for the codec video sequence -+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures -+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the -+ * value of SpsMaxLatencyPictures array -+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum -+ * luma coding block size -+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between -+ * the maximum and minimum luma -+ * coding block size -+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma -+ * transform block size -+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between -+ * the maximum and minimum luma -+ * transform block size -+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy -+ * depth for transform units of -+ * coding units coded in inter -+ * prediction mode -+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy -+ * depth for transform units of -+ * coding units coded in intra -+ * prediction mode -+ * 
@pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of -+ * bits used to represent each of PCM sample -+ * values of the luma component -+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number -+ * of bits used to represent each of PCM -+ * sample values of the chroma components -+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the -+ * minimum size of coding blocks -+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between -+ * the maximum and minimum size of -+ * coding blocks -+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() -+ * syntax structures included in the SPS -+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term -+ * reference pictures that are specified in the SPS -+ * @chroma_format_idc: specifies the chroma sampling -+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number -+ * of temporal sub-layers -+ * @reserved: padding field. Should be zeroed by applications. 
-+ * @flags: see V4L2_HEVC_SPS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_sps { -+ __u8 video_parameter_set_id; -+ __u8 seq_parameter_set_id; -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u8 reserved[6]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL 
<< 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+/** -+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set -+ * -+ * @pic_parameter_set_id: identifies the PPS for reference by other -+ * syntax elements -+ * @num_extra_slice_header_bits: specifies the number of extra slice header -+ * bits that are present in the slice header RBSP -+ * for coded pictures referring to the PPS. -+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the -+ * inferred value of num_ref_idx_l0_active_minus1 -+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the -+ * inferred value of num_ref_idx_l1_active_minus1 -+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for -+ * each slice referring to the PPS -+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding -+ * tree block size and the minimum luma coding block -+ * size of coding units that convey cu_qp_delta_abs -+ * and cu_qp_delta_sign_flag -+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb -+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr -+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns -+ * partitioning the picture -+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning -+ * the picture -+ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in -+ * units of coding tree blocks -+ * 
@row_height_minus1: this value plus 1 specifies the height of the each tile row in -+ * units of coding tree blocks -+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for -+ * beta divided by 2 -+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC -+ * divided by 2 -+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of -+ * the variable Log2ParMrgLevel -+ * @reserved: padding field. Should be zeroed by applications. -+ * @flags: see V4L2_HEVC_PPS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_pps { -+ __u8 pic_parameter_set_id; -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ __u8 reserved; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 -+ -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+/** -+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry -+ * -+ 
* @timestamp: timestamp of the V4L2 capture buffer to use as reference. -+ * @flags: long term flag for the reference frame -+ * @field_pic: whether the reference is a field picture or a frame. -+ * @reserved: padding field. Should be zeroed by applications. -+ * @pic_order_cnt_val: the picture order count of the current picture. -+ */ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 flags; -+ __u8 field_pic; -+ __u16 reserved; -+ __s32 pic_order_cnt_val; -+}; -+ -+/** -+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters -+ * -+ * @delta_luma_weight_l0: the difference of the weighting factor applied -+ * to the luma prediction value for list 0 -+ * @luma_offset_l0: the additive offset applied to the luma prediction value -+ * for list 0 -+ * @delta_chroma_weight_l0: the difference of the weighting factor applied -+ * to the chroma prediction values for list 0 -+ * @chroma_offset_l0: the difference of the additive offset applied to -+ * the chroma prediction values for list 0 -+ * @delta_luma_weight_l1: the difference of the weighting factor applied -+ * to the luma prediction value for list 1 -+ * @luma_offset_l1: the additive offset applied to the luma prediction value -+ * for list 1 -+ * @delta_chroma_weight_l1: the difference of the weighting factor applied -+ * to the chroma prediction values for list 1 -+ * @chroma_offset_l1: the difference of the additive offset applied to -+ * the chroma prediction values for list 1 -+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for -+ * all luma weighting factors -+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm -+ * of the denominator for all chroma -+ * weighting factors -+ */ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 
chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+/** -+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters -+ * -+ * This control is a dynamically sized 1-dimensional array, -+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. -+ * -+ * @bit_size: size (in bits) of the current slice data -+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data -+ * @num_entry_point_offsets: specifies the number of entry point offset syntax -+ * elements in the slice header. 
-+ * @nal_unit_type: specifies the coding type of the slice (B, P or I) -+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit -+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} -+ * @colour_plane_id: specifies the colour plane associated with the current slice -+ * @slice_pic_order_cnt: specifies the picture order count -+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum -+ * reference index for reference picture list 0 -+ * that may be used to decode the slice -+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum -+ * reference index for reference picture list 1 -+ * that may be used to decode the slice -+ * @collocated_ref_idx: specifies the reference index of the collocated picture used -+ * for temporal motion vector prediction -+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging -+ * motion vector prediction candidates supported in -+ * the slice subtracted from 5 -+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding -+ * blocks in the slice -+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset -+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset -+ * @slice_act_y_qp_offset: screen content extension parameters -+ * @slice_act_cb_qp_offset: screen content extension parameters -+ * @slice_act_cr_qp_offset: screen content extension parameters -+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 -+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 -+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or -+ * more fields -+ * @reserved0: padding field. Should be zeroed by applications. 
-+ * @slice_segment_addr: specifies the address of the first coding tree block in -+ * the slice segment -+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB -+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB -+ * @short_term_ref_pic_set_size: specifies the size of short-term reference -+ * pictures set included in the SPS -+ * @long_term_ref_pic_set_size: specifies the size of long-term reference -+ * pictures set include in the SPS -+ * @pred_weight_table: the prediction weight coefficients for inter-picture -+ * prediction -+ * @reserved1: padding field. Should be zeroed by applications. -+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_byte_offset; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __s32 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ __u8 reserved0[3]; -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u16 short_term_ref_pic_set_size; -+ __u16 long_term_ref_pic_set_size; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u8 reserved1[2]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+/** -+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters -+ * -+ * @pic_order_cnt_val: picture order count -+ * @short_term_ref_pic_set_size: specifies the size of short-term reference -+ * pictures set included in the SPS of the first slice -+ * @long_term_ref_pic_set_size: specifies the size of long-term reference -+ * pictures set include in the SPS of the first slice -+ * @num_active_dpb_entries: the number of entries in dpb -+ * @num_poc_st_curr_before: the number of reference pictures in the short-term -+ * set that come before the current frame -+ * @num_poc_st_curr_after: the number of reference pictures in the short-term -+ * set that come after the current frame -+ * @num_poc_lt_curr: the number of reference pictures in the long-term set -+ * @poc_st_curr_before: provides the index of the short term before references -+ * in DPB array -+ * @poc_st_curr_after: provides the index of the short term after references -+ * in DPB array -+ * @poc_lt_curr: provides the index of the long term references in DPB array -+ * @reserved: padding field. Should be zeroed by applications. 
-+ * @dpb: the decoded picture buffer, for meta-data about reference frames -+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u16 short_term_ref_pic_set_size; -+ __u16 long_term_ref_pic_set_size; -+ __u8 num_active_dpb_entries; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 reserved[4]; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+/** -+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters -+ * -+ * @scaling_list_4x4: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_8x8: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_16x16: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_32x32: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process -+ * for transform coefficients. The values on each -+ * scaling list are expected in raster scan order. -+ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process -+ * for transform coefficients. The values on each -+ * scaling list are expected in raster scan order. 
-+ */ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c -new file mode 100644 -index 0000000000..c35579d8e0 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v4.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 4 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 611fa21cc3..761c5b2dc7 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -6,8 +6,6 @@ - #include "internal.h" - #include "thread.h" - --#include "v4l2_request_hevc.h" -- - #if HEVC_CTRLS_VERSION == 1 - #include "hevc-ctrls-v1.h" - -@@ -18,10 +16,37 @@ - #include "hevc-ctrls-v2.h" - #elif HEVC_CTRLS_VERSION == 3 - #include "hevc-ctrls-v3.h" -+#elif HEVC_CTRLS_VERSION == 4 -+#include -+#if !defined(V4L2_CID_STATELESS_HEVC_SPS) -+#include "hevc-ctrls-v4.h" -+#endif - #else - #error Unknown HEVC_CTRLS_VERSION - #endif - -+#ifndef V4L2_CID_STATELESS_HEVC_SPS -+#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS -+#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS -+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS -+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX -+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS -+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE -+#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE -+ -+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED -+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED -+#define 
V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE -+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B -+#endif -+ -+// Should be in videodev2 but we might not have a good enough one -+#ifndef V4L2_PIX_FMT_HEVC_SLICE -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+#endif -+ -+#include "v4l2_request_hevc.h" -+ - #include "libavutil/hwcontext_drm.h" - - #include -@@ -259,9 +284,13 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const - #endif - entry->field_pic = frame->frame->interlaced_frame; - -+#if HEVC_CTRLS_VERSION <= 3 - /* TODO: Interleaved: Get the POC for each field. */ - entry->pic_order_cnt[0] = frame->poc; - entry->pic_order_cnt[1] = frame->poc; -+#else -+ entry->pic_order_cnt_val = frame->poc; -+#endif - } - } - return n; -@@ -287,8 +316,11 @@ static void fill_slice_params(const HEVCContext * const h, - - *slice_params = (struct v4l2_ctrl_hevc_slice_params) { - .bit_size = bit_size, -+#if HEVC_CTRLS_VERSION <= 3 - .data_bit_offset = bit_offset, -- -+#else -+ .data_byte_offset = bit_offset / 8 + 1, -+#endif - /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ - .slice_segment_addr = sh->slice_segment_addr, - -@@ -376,8 +408,10 @@ static void fill_slice_params(const HEVCContext * const h, - av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); - } - -+#if HEVC_CTRLS_VERSION <= 3 - for (i = 0; i < slice_params->num_entry_point_offsets; i++) - slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; -+#endif - } - - #if HEVC_CTRLS_VERSION >= 2 -@@ -761,30 +795,30 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, - - struct v4l2_ext_control control[] = { - { -- .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .id = V4L2_CID_STATELESS_HEVC_SPS, - .ptr = &controls->sps, - .size = sizeof(controls->sps), - }, - { -- .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, -+ .id = V4L2_CID_STATELESS_HEVC_PPS, - .ptr = &controls->pps, - .size = sizeof(controls->pps), - }, - #if HEVC_CTRLS_VERSION >= 2 - { -- .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, -+ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, - .ptr = dec, - .size = sizeof(*dec), - }, - #endif - { -- .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, - .ptr = slices + slice_no, - .size = sizeof(*slices) * slice_count, - }, - // Optional - { -- .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, - .ptr = &controls->scaling_matrix, - .size = sizeof(controls->scaling_matrix), - }, -@@ -1000,12 +1034,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - - // Check for var slice array - struct v4l2_query_ext_ctrl qc[] = { -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, -+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_STATELESS_HEVC_SPS }, -+ { .id = 
V4L2_CID_STATELESS_HEVC_PPS }, -+ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, - #if HEVC_CTRLS_VERSION >= 2 -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, - #endif - }; - // Order & size must match! -@@ -1042,12 +1076,13 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - - fill_sps(&ctrl_sps, sps); - -- if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { - av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); - return AVERROR(EINVAL); - } - - ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; -+ av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No"); - return 0; - } - -@@ -1058,29 +1093,29 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - int ret; - - struct v4l2_query_ext_ctrl querys[] = { -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, - }; - - struct v4l2_ext_control ctrls[] = { -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, - }; - - mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); - - ctx->decode_mode = querys[0].default_value; - -- if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -- ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != 
V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); - return AVERROR(EINVAL); - } - - ctx->start_code = querys[1].default_value; -- if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -- ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); - return AVERROR(EINVAL); - } -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 20e4e0ab15..cd79aad563 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - goto fail4; - } - -- if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { -+ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 4); -+ } -+ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 3); - } -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -index ed48d62e2d..d4adb3f812 100644 ---- a/libavcodec/v4l2_request_hevc.h -+++ b/libavcodec/v4l2_request_hevc.h -@@ -99,5 +99,6 @@ typedef struct v4l2_req_decode_fns { - extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); - extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); - extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); - - #endif - -From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Sun, 3 Jul 2022 13:40:41 +0000 -Subject: [PATCH 057/136] v4l2_req: Observe limit on size of 
slice_array - -This in fact provides some minor simplifications by combing the -multi-slice and single-slice paths. - -(cherry picked from commit 7631e6d1a66fca9048605c214f3464c90d37932c) ---- - libavcodec/v4l2_req_hevc_vx.c | 39 ++++++++++++++-------------------- - libavcodec/v4l2_request_hevc.h | 5 +---- - 2 files changed, 17 insertions(+), 27 deletions(-) - -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 761c5b2dc7..9d08d13d9e 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -840,18 +840,21 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - int bcount = get_bits_count(&h->HEVClc->gb); - uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; - -+ const unsigned int n = rd->num_slices; -+ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; -+ - int rv; - struct slice_info * si; - - if ((rv = slice_add(rd)) != 0) - return rv; - -- si = rd->slices + rd->num_slices - 1; -+ si = rd->slices + n; - si->ptr = buffer; - si->len = size; - -- if (ctx->multi_slice && rd->num_slices > 1) { -- struct slice_info *const si0 = rd->slices; -+ if (n != block_start) { -+ struct slice_info *const si0 = rd->slices + block_start; - const size_t offset = (buffer - si0->ptr); - boff += offset * 8; - size += offset; -@@ -859,11 +862,11 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - } - - #if HEVC_CTRLS_VERSION >= 2 -- if (rd->num_slices == 1) -+ if (n == 0) - fill_decode_params(h, &rd->dec); -- fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); -+ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); - #else -- fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); -+ fill_slice_params(h, rd->slice_params + n, size * 8, boff); - #endif - - return 0; -@@ -997,18 +1000,11 @@ static int 
v4l2_request_hevc_end_frame(AVCodecContext *avctx) - } - - // Send as slices -- if (ctx->multi_slice) -- { -- if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) -+ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { -+ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); -+ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) - goto fail; - } -- else -- { -- for (i = 0; i != rd->num_slices; ++i) { -- if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) -- goto fail; -- } -- } - - // Set the drm_prime desriptor - drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); -@@ -1081,8 +1077,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - return AVERROR(EINVAL); - } - -- ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; -- av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No"); - return 0; - } - -@@ -1120,11 +1114,10 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - return AVERROR(EINVAL); - } - -- ctx->max_slices = querys[2].elems; -- if (ctx->max_slices > MAX_SLICES) { -- av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); -- return AVERROR(EINVAL); -- } -+ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || -+ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
-+ 1 : querys[2].dims[0]; -+ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); - - ctrls[0].value = ctx->decode_mode; - ctrls[1].value = ctx->start_code; -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -index d4adb3f812..0029e23309 100644 ---- a/libavcodec/v4l2_request_hevc.h -+++ b/libavcodec/v4l2_request_hevc.h -@@ -46,8 +46,6 @@ - #define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 - #endif - --#define MAX_SLICES 128 -- - #define VCAT(name, version) name##_v##version - #define V2(n,v) VCAT(n, v) - #define V(n) V2(n, HEVC_CTRLS_VERSION) -@@ -64,10 +62,9 @@ typedef struct V4L2RequestContextHEVC { - - unsigned int timestamp; // ?? maybe uint64_t - -- int multi_slice; - int decode_mode; - int start_code; -- int max_slices; -+ unsigned int max_slices; - - req_decode_q decode_q; - - -From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 4 Jul 2022 14:43:20 +0100 -Subject: [PATCH 058/136] v4l2_req: Add entry point offsets array control - ---- - libavcodec/v4l2_req_hevc_vx.c | 88 +++++++++++++++++++++++++++------- - libavcodec/v4l2_request_hevc.h | 3 +- - 2 files changed, 72 insertions(+), 19 deletions(-) - -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 9d08d13d9e..43ef6631ed 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -82,11 +82,16 @@ typedef struct V4L2MediaReqDescriptor { - struct v4l2_ctrl_hevc_slice_params * slice_params; - struct slice_info * slices; - -+ size_t num_offsets; -+ size_t alloced_offsets; -+ uint32_t *offsets; -+ - } V4L2MediaReqDescriptor; - - struct slice_info { - const uint8_t * ptr; - size_t len; // bytes -+ size_t n_offsets; - }; - - // Handy container for accumulating controls before setting -@@ -245,7 +250,7 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) - if (rd->num_slices >= rd->alloced_slices) { - struct v4l2_ctrl_hevc_slice_params * p2; - struct slice_info * 
s2; -- size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; -+ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; - - p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); - if (p2 == NULL) -@@ -263,6 +268,23 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) - return 0; - } - -+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) -+{ -+ if (rd->num_offsets + n > rd->alloced_offsets) { -+ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2; -+ void * p2; -+ while (rd->num_offsets + n > n2) -+ n2 *= 2; -+ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) -+ return AVERROR(ENOMEM); -+ rd->offsets = p2; -+ rd->alloced_offsets = n2; -+ } -+ for (size_t i = 0; i != n; ++i) -+ rd->offsets[rd->num_offsets++] = offsets[i] - 1; -+ return 0; -+} -+ - static unsigned int - fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) - { -@@ -403,12 +425,12 @@ static void fill_slice_params(const HEVCContext * const h, - fill_pred_table(h, &slice_params->pred_weight_table); - - slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; -+#if HEVC_CTRLS_VERSION <= 3 - if (slice_params->num_entry_point_offsets > 256) { - slice_params->num_entry_point_offsets = 256; - av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); - } - --#if HEVC_CTRLS_VERSION <= 3 - for (i = 0; i < slice_params->num_entry_point_offsets; i++) - slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; - #endif -@@ -787,13 +809,17 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, - #if HEVC_CTRLS_VERSION >= 2 - struct v4l2_ctrl_hevc_decode_params * const dec, - #endif -- struct v4l2_ctrl_hevc_slice_params * const slices, -- const unsigned int slice_no, -- const unsigned int slice_count) -+ struct 
v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, -+ void * const offsets, const size_t offset_count) - { - int rv; -+#if HEVC_CTRLS_VERSION >= 2 -+ unsigned int n = 4; -+#else -+ unsigned int n = 3; -+#endif - -- struct v4l2_ext_control control[] = { -+ struct v4l2_ext_control control[6] = { - { - .id = V4L2_CID_STATELESS_HEVC_SPS, - .ptr = &controls->sps, -@@ -813,21 +839,28 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, - #endif - { - .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, -- .ptr = slices + slice_no, -+ .ptr = slices, - .size = sizeof(*slices) * slice_count, - }, -- // Optional -- { -+ }; -+ -+ if (controls->has_scaling) -+ control[n++] = (struct v4l2_ext_control) { - .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, - .ptr = &controls->scaling_matrix, - .size = sizeof(controls->scaling_matrix), -- }, -- }; -+ }; -+ -+#if HEVC_CTRLS_VERSION >= 4 -+ if (offsets) -+ control[n++] = (struct v4l2_ext_control) { -+ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, -+ .ptr = offsets, -+ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, -+ }; -+#endif - -- rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, -- controls->has_scaling ? 
-- FF_ARRAY_ELEMS(control) : -- FF_ARRAY_ELEMS(control) - 1); -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); - - return rv; - } -@@ -852,6 +885,7 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - si = rd->slices + n; - si->ptr = buffer; - si->len = size; -+ si->n_offsets = rd->num_offsets; - - if (n != block_start) { - struct slice_info *const si0 = rd->slices + block_start; -@@ -868,6 +902,9 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - #else - fill_slice_params(h, rd->slice_params + n, size * 8, boff); - #endif -+ if (ctx->max_offsets != 0 && -+ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) -+ return rv; - - return 0; - } -@@ -893,10 +930,13 @@ static int send_slice(AVCodecContext * const avctx, - { - V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; - -+ const int is_last = (j == rd->num_slices); - struct slice_info *const si = rd->slices + i; - struct media_request * req = NULL; - struct qent_src * src = NULL; - MediaBufsStatus stat; -+ void * offsets = rd->offsets + rd->slices[i].n_offsets; -+ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; - - if ((req = media_request_get(ctx->mpool)) == NULL) { - av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); -@@ -908,8 +948,8 @@ static int send_slice(AVCodecContext * const avctx, - #if HEVC_CTRLS_VERSION >= 2 - &rd->dec, - #endif -- rd->slice_params, -- i, j - i)) { -+ rd->slice_params + i, j - i, -+ offsets, n_offsets)) { - av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); - goto fail1; - } -@@ -935,7 +975,7 @@ static int send_slice(AVCodecContext * const avctx, - - stat = mediabufs_start_request(ctx->mbufs, &req, &src, - i == 0 ? 
rd->qe_dst : NULL, -- j == rd->num_slices); -+ is_last); - - if (stat != MEDIABUFS_STATUS_SUCCESS) { - av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); -@@ -1090,6 +1130,9 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, - { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, - { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, -+#if HEVC_CTRLS_VERSION >= 4 -+ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, -+#endif - }; - - struct v4l2_ext_control ctrls[] = { -@@ -1119,6 +1162,14 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - 1 : querys[2].dims[0]; - av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); - -+#if HEVC_CTRLS_VERSION >= 4 -+ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? -+ 0 : querys[3].dims[0]; -+ av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); -+#else -+ ctx->max_offsets = 0; -+#endif -+ - ctrls[0].value = ctx->decode_mode; - ctrls[1].value = ctx->start_code; - -@@ -1141,6 +1192,7 @@ static void v4l2_req_frame_free(void *opaque, uint8_t *data) - - av_freep(&rd->slices); - av_freep(&rd->slice_params); -+ av_freep(&rd->offsets); - - av_free(rd); - } -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -index 0029e23309..99c90064ea 100644 ---- a/libavcodec/v4l2_request_hevc.h -+++ b/libavcodec/v4l2_request_hevc.h -@@ -64,7 +64,8 @@ typedef struct V4L2RequestContextHEVC { - - int decode_mode; - int start_code; -- unsigned int max_slices; -+ unsigned int max_slices; // 0 => not wanted (frame mode) -+ unsigned int max_offsets; // 0 => not wanted - - req_decode_q decode_q; - - -From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 4 Jul 2022 16:22:54 +0100 -Subject: [PATCH 059/136] v4l2_req: Support Annex B - ---- - libavcodec/v4l2_req_hevc_vx.c | 61 
+++++++++++++++++++++++------------ - 1 file changed, 41 insertions(+), 20 deletions(-) - -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 43ef6631ed..5e0db9850a 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -879,6 +879,18 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - int rv; - struct slice_info * si; - -+ // This looks dodgy but we know that FFmpeg has parsed this from a buffer -+ // that contains the entire frame including the start code -+ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { -+ buffer -= 3; -+ size += 3; -+ boff += 24; -+ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { -+ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", -+ buffer[0], buffer[1], buffer[2]); -+ } -+ } -+ - if ((rv = slice_add(rd)) != 0) - return rv; - -@@ -969,10 +981,6 @@ static int send_slice(AVCodecContext * const avctx, - goto fail2; - } - --#warning ANNEX_B start code --// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { --// } -- - stat = mediabufs_start_request(ctx->mbufs, &req, &src, - i == 0 ? 
rd->qe_dst : NULL, - is_last); -@@ -1120,6 +1128,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - return 0; - } - -+static inline int -+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) -+{ -+ return v >= c->minimum && v <= c->maximum; -+} -+ - // Final init - static int - set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -@@ -1142,21 +1156,6 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - - mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); - -- ctx->decode_mode = querys[0].default_value; -- -- if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && -- ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { -- av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -- return AVERROR(EINVAL); -- } -- -- ctx->start_code = querys[1].default_value; -- if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && -- ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { -- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); -- return AVERROR(EINVAL); -- } -- - ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || - querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? - 1 : querys[2].dims[0]; -@@ -1165,11 +1164,33 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - #if HEVC_CTRLS_VERSION >= 4 - ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
- 0 : querys[3].dims[0]; -- av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); -+ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); - #else - ctx->max_offsets = 0; - #endif - -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -+ -+ if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -+ { -+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; -+ -+ // Prefer NONE as it doesn't require the slightly dodgy look -+ // backwards in our raw buffer -+ if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -+ else { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); -+ } -+ - ctrls[0].value = ctx->decode_mode; - ctrls[1].value = ctx->start_code; - - -From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 4 Jul 2022 18:24:03 +0100 -Subject: [PATCH 060/136] v4l2_req: Add frame mode decode - ---- - libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------ - 1 file changed, 46 insertions(+), 23 deletions(-) - -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 5e0db9850a..ada53d0d44 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -814,9 +814,9 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, - { - int rv; - #if HEVC_CTRLS_VERSION >= 2 -- unsigned int n = 4; --#else - unsigned int n = 3; -+#else -+ unsigned int n = 2; - #endif - - struct v4l2_ext_control control[6] = { -@@ -837,12 +837,14 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const 
mreq, - .size = sizeof(*dec), - }, - #endif -- { -+ }; -+ -+ if (slices) -+ control[n++] = (struct v4l2_ext_control) { - .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, - .ptr = slices, - .size = sizeof(*slices) * slice_count, -- }, -- }; -+ }; - - if (controls->has_scaling) - control[n++] = (struct v4l2_ext_control) { -@@ -865,6 +867,8 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, - return rv; - } - -+// This only works because we started out from a single coded frame buffer -+// that will remain intact until after end_frame - static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) - { - const HEVCContext * const h = avctx->priv_data; -@@ -891,6 +895,17 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * - } - } - -+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { -+ if (rd->slices == NULL) { -+ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) -+ return AVERROR(ENOMEM); -+ rd->slices->ptr = buffer; -+ rd->num_slices = 1; -+ } -+ rd->slices->len = buffer - rd->slices->ptr + size; -+ return 0; -+ } -+ - if ((rv = slice_add(rd)) != 0) - return rv; - -@@ -1169,28 +1184,36 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - ctx->max_offsets = 0; - #endif - -- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -- -- if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -- { -+ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || -+ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) -+ ctx->decode_mode = querys[0].default_value; -+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) -+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; -+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) - ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; -- -- // 
Prefer NONE as it doesn't require the slightly dodgy look -- // backwards in our raw buffer -- if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -- else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) -- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -- else { -- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); -- return AVERROR(EINVAL); -- } -- } -- else -- { -+ else { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); -+ return AVERROR(EINVAL); - } - -+ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || -+ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) -+ ctx->start_code = querys[1].default_value; -+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -+ else { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ // If we are in slice mode & START_CODE_NONE supported then pick that -+ // as it doesn't require the slightly dodgy look backwards in our raw buffer -+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && -+ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -+ - ctrls[0].value = ctx->decode_mode; - ctrls[1].value = ctx->start_code; - - -From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 5 Jul 2022 12:54:22 +0000 -Subject: [PATCH 061/136] v4l2_req: Fix probe for frame based decode - ---- - libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++---------- - 1 file changed, 23 insertions(+), 10 deletions(-) - -diff --git a/libavcodec/v4l2_req_hevc_vx.c 
b/libavcodec/v4l2_req_hevc_vx.c -index ada53d0d44..5d083016f8 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -1082,6 +1082,12 @@ fail: - return rv; - } - -+static inline int -+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) -+{ -+ return v >= c->minimum && v <= c->maximum; -+} -+ - // Initial check & init - static int - probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -@@ -1094,6 +1100,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - // Check for var slice array - struct v4l2_query_ext_ctrl qc[] = { - { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, - { .id = V4L2_CID_STATELESS_HEVC_SPS }, - { .id = V4L2_CID_STATELESS_HEVC_PPS }, - { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, -@@ -1104,6 +1111,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - // Order & size must match! - static const size_t ctrl_sizes[] = { - sizeof(struct v4l2_ctrl_hevc_slice_params), -+ sizeof(int32_t), - sizeof(struct v4l2_ctrl_hevc_sps), - sizeof(struct v4l2_ctrl_hevc_pps), - sizeof(struct v4l2_ctrl_hevc_scaling_matrix), -@@ -1121,11 +1129,22 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - return AVERROR(EINVAL); - #endif - -- if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { -- av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); -+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); -+ i = 0; -+#if HEVC_CTRLS_VERSION >= 4 -+ // Skip slice check if no slice mode -+ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -+ i = 1; -+#else -+ // Fail frame mode silently for anything prior to V4 -+ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) - return AVERROR(EINVAL); -- } -- for (i = 0; i != noof_ctrls; ++i) { -+#endif -+ for (; i != noof_ctrls; 
++i) { -+ if (qc[i].type == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); -+ return AVERROR(EINVAL); -+ } - if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { - av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", - HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); -@@ -1143,12 +1162,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - return 0; - } - --static inline int --ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) --{ -- return v >= c->minimum && v <= c->maximum; --} -- - // Final init - static int - set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) - -From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 26 Jul 2022 15:46:14 +0000 -Subject: [PATCH 062/136] vf_deinterlace_v4l2m2m: Support NV12 through - deinterlace - -Supports NV12 (though not yet NV12M) through deinterlace. -Also improves error handling such that attempting to deinterlace an -unsupported drm format causes an error. -No longer leaks frame structures. 
---- - libavfilter/vf_deinterlace_v4l2m2m.c | 160 ++++++++++++++++++--------- - 1 file changed, 107 insertions(+), 53 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 1a933b7e0a..1a3bef5bcb 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -373,14 +373,16 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); - - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || -+ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && -+ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || - fmt->fmt.pix_mp.field != field) { - av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); - - return AVERROR(EINVAL); - } - } else { -- if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || -+ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && -+ fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || - fmt->fmt.pix.field != field) { - av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); - -@@ -391,7 +393,7 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - return 0; - } - --static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) -+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) - { - struct v4l2_format *fmt = &queue->format; - DeintV4L2M2MContextShared *ctx = queue->ctx; -@@ -402,13 +404,16 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, - .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, - }; - -+ // This works for most single object 4:2:0 types - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = pixelformat; - fmt->fmt.pix_mp.field = field; - fmt->fmt.pix_mp.width = width; - fmt->fmt.pix_mp.height = ysize / pitch; - fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; - fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); - } else { -+ fmt->fmt.pix.pixelformat = pixelformat; - fmt->fmt.pix.field = field; - fmt->fmt.pix.width = width; - fmt->fmt.pix.height = height; -@@ -417,12 +422,22 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, - } - - ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); -- if (ret) -+ if (ret) { -+ ret = AVERROR(errno); - av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); -+ return ret; -+ } -+ -+ if (pixelformat != fmt->fmt.pix.pixelformat) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); -+ return AVERROR(EINVAL); -+ } - - ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); -- if (ret) -- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); -+ } - - sel.r.width = width; - sel.r.height = height; -@@ -432,10 +447,12 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, - sel.flags = V4L2_SEL_FLAG_LE; - - ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); -- if (ret) -- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); -+ } - -- return ret; -+ return 0; - } - - static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) -@@ -517,10 +534,25 @@ static int 
deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) - return 0; - } - --static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) - { - struct v4l2_exportbuffer expbuf; - int i, ret; -+ uint64_t mod = DRM_FORMAT_MOD_LINEAR; -+ uint32_t fmt = 0; -+ -+ switch (pixelformat) { -+ case V4L2_PIX_FMT_NV12: -+ fmt = DRM_FORMAT_NV12; -+ break; -+ case V4L2_PIX_FMT_YUV420: -+ fmt = DRM_FORMAT_YUV420; -+ break; -+ default: -+ return AVERROR(EINVAL); -+ } -+ -+ avbuf->drm_frame.layers[0].format = fmt; - - for (i = 0; i < avbuf->num_planes; i++) { - memset(&expbuf, 0, sizeof(expbuf)); -@@ -539,12 +571,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - /* drm frame */ - avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; - avbuf->drm_frame.objects[i].fd = expbuf.fd; -- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ avbuf->drm_frame.objects[i].format_modifier = mod; - } else { - /* drm frame */ - avbuf->drm_frame.objects[0].size = avbuf->buffer.length; - avbuf->drm_frame.objects[0].fd = expbuf.fd; -- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ avbuf->drm_frame.objects[0].format_modifier = mod; - } - } - -@@ -629,7 +661,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - if (ret) - goto fail; - -- ret = v4l2_buffer_export_drm(buf); -+ ret = v4l2_buffer_export_drm(buf, multiplanar ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); - if (ret) - goto fail; - } -@@ -878,7 +910,6 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) - - static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) - { -- int av_pix_fmt = AV_PIX_FMT_YUV420P; - AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; - AVDRMLayerDescriptor *layer; - -@@ -895,20 +926,13 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) - layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; - } - -- switch (av_pix_fmt) { -- case AV_PIX_FMT_YUYV422: -- -- layer->format = DRM_FORMAT_YUYV; -+ switch (layer->format) { -+ case DRM_FORMAT_YUYV: - layer->nb_planes = 1; -- - break; - -- case AV_PIX_FMT_NV12: -- case AV_PIX_FMT_NV21: -- -- layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? -- DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -- -+ case DRM_FORMAT_NV12: -+ case DRM_FORMAT_NV21: - if (avbuf->num_planes > 1) - break; - -@@ -920,10 +944,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) - layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; - break; - -- case AV_PIX_FMT_YUV420P: -- -- layer->format = DRM_FORMAT_YUV420; -- -+ case DRM_FORMAT_YUV420: - if (avbuf->num_planes > 1) - break; - -@@ -1032,6 +1053,26 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) - return 0; - } - -+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) -+{ -+ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || -+ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); -+ -+ switch (drm_desc->layers[0].format) { -+ case DRM_FORMAT_YUV420: -+ if (is_linear) -+ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; -+ break; -+ case DRM_FORMAT_NV12: -+ if (is_linear) -+ return drm_desc->nb_objects == 1 ? 
V4L2_PIX_FMT_NV12 : 0; -+ break; -+ default: -+ break; -+ } -+ return 0; -+} -+ - static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - { - AVFilterContext *avctx = link->dst; -@@ -1047,23 +1088,27 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); - - if (ctx->field_order == V4L2_FIELD_ANY) { -- AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ const uint32_t pixelformat = desc_pixelformat(drm_desc); -+ -+ if (pixelformat == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", -+ av_fourcc2str(drm_desc->layers[0].format), -+ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); -+ return AVERROR(EINVAL); -+ } -+ - ctx->orig_width = drm_desc->layers[0].planes[0].pitch; - ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; - - av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, - drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); - -- if (in->top_field_first) -- ctx->field_order = V4L2_FIELD_INTERLACED_TB; -- else -- ctx->field_order = V4L2_FIELD_INTERLACED_BT; -- -- ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -+ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); - if (ret) - return ret; - -- ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -+ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, 
drm_desc->layers[0].planes[1].offset); - if (ret) - return ret; - -@@ -1082,6 +1127,12 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - ret = deint_v4l2m2m_streamon(output); - if (ret) - return ret; -+ -+ if (in->top_field_first) -+ ctx->field_order = V4L2_FIELD_INTERLACED_TB; -+ else -+ ctx->field_order = V4L2_FIELD_INTERLACED_BT; -+ - } - - ret = deint_v4l2m2m_enqueue_frame(output, in); -@@ -1157,28 +1208,31 @@ again: - return 0; - } - -- { -+ recycle_q(&s->output); -+ n = count_enqueued(&s->output); -+ -+ while (n < 6) { - AVFrame * frame; - int rv; - -- recycle_q(&s->output); -- n = count_enqueued(&s->output); -+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { -+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } - -- while (n < 6) { -- if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { -- av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); -- return rv; -- } -+ if (frame == NULL) { -+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); -+ break; -+ } - -- if (frame == NULL) { -- av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); -- break; -- } -+ rv = deint_v4l2m2m_filter_frame(inlink, frame); -+ av_frame_free(&frame); - -- deint_v4l2m2m_filter_frame(inlink, frame); -- av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); -- ++n; -- } -+ if (rv != 0) -+ return rv; -+ -+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); -+ ++n; - } - - if (n < 6) { - -From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 19 Aug 2022 15:29:11 +0000 -Subject: [PATCH 063/136] v4l2_req: Enable use of MMAP for buffer alloc - -Use MMAP rather than DMABUF if either the dmabuf device can't be opened -or create_buf doesn't set the capability. 
---- - libavcodec/v4l2_req_dmabufs.c | 22 +++ - libavcodec/v4l2_req_dmabufs.h | 3 + - libavcodec/v4l2_req_media.c | 263 ++++++++++++++++++++++++++++----- - libavcodec/v4l2_req_media.h | 21 ++- - libavcodec/v4l2_request_hevc.c | 42 +++++- - 5 files changed, 307 insertions(+), 44 deletions(-) - -diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c -index ae6c648369..c4bbed18c6 100644 ---- a/libavcodec/v4l2_req_dmabufs.c -+++ b/libavcodec/v4l2_req_dmabufs.c -@@ -36,6 +36,26 @@ static unsigned int total_bufs = 0; - static size_t total_size = 0; - #endif - -+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size) -+{ -+ struct dmabuf_h *dh; -+ -+ if (mapptr == MAP_FAILED) -+ return NULL; -+ -+ dh = malloc(sizeof(*dh)); -+ if (!dh) -+ return NULL; -+ -+ *dh = (struct dmabuf_h) { -+ .fd = -1, -+ .size = size, -+ .mapptr = mapptr -+ }; -+ -+ return dh; -+} -+ - struct dmabuf_h * dmabuf_import(int fd, size_t size) - { - struct dmabuf_h *dh; -@@ -122,6 +142,8 @@ int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) - struct dma_buf_sync sync = { - .flags = flags - }; -+ if (dh->fd == -1) -+ return 0; - while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { - const int err = errno; - if (errno == EINTR) -diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h -index cfb17e801d..c1d3d8c8d7 100644 ---- a/libavcodec/v4l2_req_dmabufs.h -+++ b/libavcodec/v4l2_req_dmabufs.h -@@ -18,6 +18,9 @@ static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t s - } - /* Create from existing fd - dups(fd) */ - struct dmabuf_h * dmabuf_import(int fd, size_t size); -+/* Import an MMAP - return NULL if mapptr = MAP_FAIL */ -+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size); -+ - void * dmabuf_map(struct dmabuf_h * const dh); - - /* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ -diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c -index 980b306b8a..910ac77bb6 100644 ---- 
a/libavcodec/v4l2_req_media.c -+++ b/libavcodec/v4l2_req_media.c -@@ -33,9 +33,11 @@ - #include - #include - #include -+#include - #include - #include - #include -+#include - - #include - -@@ -95,6 +97,32 @@ struct media_request { - struct polltask * pt; - }; - -+static inline enum v4l2_memory -+mediabufs_memory_to_v4l2(const enum mediabufs_memory m) -+{ -+ return (enum v4l2_memory)m; -+} -+ -+const char * -+mediabufs_memory_name(const enum mediabufs_memory m) -+{ -+ switch (m) { -+ case MEDIABUFS_MEMORY_UNSET: -+ return "Unset"; -+ case MEDIABUFS_MEMORY_MMAP: -+ return "MMap"; -+ case MEDIABUFS_MEMORY_USERPTR: -+ return "UserPtr"; -+ case MEDIABUFS_MEMORY_OVERLAY: -+ return "Overlay"; -+ case MEDIABUFS_MEMORY_DMABUF: -+ return "DMABuf"; -+ default: -+ break; -+ } -+ return "Unknown"; -+} -+ - - static inline int do_trywait(sem_t *const sem) - { -@@ -115,14 +143,14 @@ static inline int do_wait(sem_t *const sem) - } - - static int request_buffers(int video_fd, unsigned int type, -- enum v4l2_memory memory, unsigned int buffers_count) -+ enum mediabufs_memory memory, unsigned int buffers_count) - { - struct v4l2_requestbuffers buffers; - int rc; - - memset(&buffers, 0, sizeof(buffers)); - buffers.type = type; -- buffers.memory = memory; -+ buffers.memory = mediabufs_memory_to_v4l2(memory); - buffers.count = buffers_count; - - rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); -@@ -324,6 +352,7 @@ struct qent_base { - struct qent_base *next; - struct qent_base *prev; - enum qent_status status; -+ enum mediabufs_memory memtype; - uint32_t index; - struct dmabuf_h *dh[VIDEO_MAX_PLANES]; - struct timeval timestamp; -@@ -348,9 +377,9 @@ struct qe_list_head { - }; - - struct buf_pool { -+ enum mediabufs_memory memtype; - pthread_mutex_t lock; - sem_t free_sem; -- enum v4l2_buf_type buf_type; - struct qe_list_head free; - struct qe_list_head inuse; - }; -@@ -367,9 +396,10 @@ static inline struct qent_src *base_to_src(struct qent_base *be) - } - - --#define 
QENT_BASE_INITIALIZER {\ -+#define QENT_BASE_INITIALIZER(mtype) {\ - .ref_count = ATOMIC_VAR_INIT(0),\ - .status = QENT_NEW,\ -+ .memtype = (mtype),\ - .index = INDEX_UNSET\ - } - -@@ -390,13 +420,13 @@ static void qe_src_free(struct qent_src *const be_src) - free(be_src); - } - --static struct qent_src * qe_src_new(void) -+static struct qent_src * qe_src_new(enum mediabufs_memory mtype) - { - struct qent_src *const be_src = malloc(sizeof(*be_src)); - if (!be_src) - return NULL; - *be_src = (struct qent_src){ -- .base = QENT_BASE_INITIALIZER -+ .base = QENT_BASE_INITIALIZER(mtype) - }; - return be_src; - } -@@ -413,13 +443,13 @@ static void qe_dst_free(struct qent_dst *const be_dst) - free(be_dst); - } - --static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) -+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype) - { - struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); - if (!be_dst) - return NULL; - *be_dst = (struct qent_dst){ -- .base = QENT_BASE_INITIALIZER, -+ .base = QENT_BASE_INITIALIZER(memtype), - .lock = PTHREAD_MUTEX_INITIALIZER, - .cond = PTHREAD_COND_INITIALIZER, - .mbc_wl = ff_weak_link_ref(wl) -@@ -553,14 +583,14 @@ static struct qent_base *queue_tryget_free(struct buf_pool *const bp) - return buf; - } - --static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) -+static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index) - { - struct qent_base *be; - - pthread_mutex_lock(&bp->lock); - /* Expect 1st in Q, but allow anywhere */ - for (be = bp->inuse.head; be; be = be->next) { -- if (dmabuf_fd(be->dh[0]) == fd) { -+ if (be->index == index) { - bq_extract_inuse(bp, be); - break; - } -@@ -602,6 +632,8 @@ struct mediabufs_ctl { - struct pollqueue * pq; - struct ff_weak_link_master * this_wlm; - -+ enum mediabufs_memory src_memtype; -+ enum mediabufs_memory dst_memtype; - struct v4l2_format 
src_fmt; - struct v4l2_format dst_fmt; - struct v4l2_capability capability; -@@ -614,7 +646,7 @@ static int qe_v4l2_queue(struct qent_base *const be, - { - struct v4l2_buffer buffer = { - .type = fmt->type, -- .memory = V4L2_MEMORY_DMABUF, -+ .memory = mediabufs_memory_to_v4l2(be->memtype), - .index = be->index - }; - struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; -@@ -628,7 +660,10 @@ static int qe_v4l2_queue(struct qent_base *const be, - /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ - planes[i].length = dmabuf_size(be->dh[i]); - planes[i].bytesused = dmabuf_len(be->dh[i]); -- planes[i].m.fd = dmabuf_fd(be->dh[i]); -+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) -+ planes[i].m.fd = dmabuf_fd(be->dh[i]); -+ else -+ planes[i].m.mem_offset = 0; - } - buffer.m.planes = planes; - buffer.length = i; -@@ -639,7 +674,10 @@ static int qe_v4l2_queue(struct qent_base *const be, - - buffer.bytesused = dmabuf_len(be->dh[0]); - buffer.length = dmabuf_size(be->dh[0]); -- buffer.m.fd = dmabuf_fd(be->dh[0]); -+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) -+ buffer.m.fd = dmabuf_fd(be->dh[0]); -+ else -+ buffer.m.offset = 0; - } - - if (!is_dst && mreq) { -@@ -668,14 +706,13 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, - const int vfd, - const struct v4l2_format * const f) - { -- int fd; - struct qent_base *be; - int rc; - const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); - struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; - struct v4l2_buffer buffer = { - .type = f->type, -- .memory = V4L2_MEMORY_DMABUF -+ .memory = mediabufs_memory_to_v4l2(bp->memtype) - }; - if (mp) { - buffer.length = f->fmt.pix_mp.num_planes; -@@ -690,10 +727,9 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, - return NULL; - } - -- fd = mp ? 
planes[0].m.fd : buffer.m.fd; -- be = queue_find_extract_fd(bp, fd); -+ be = queue_find_extract_index(bp, buffer.index); - if (!be) { -- request_log("Failed to find fd %d in Q\n", fd); -+ request_log("Failed to find index %d in Q\n", buffer.index); - return NULL; - } - -@@ -1104,7 +1140,7 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru - - struct v4l2_create_buffers cbuf = { - .count = n, -- .memory = V4L2_MEMORY_DMABUF, -+ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype), - .format = mbc->dst_fmt, - }; - -@@ -1125,12 +1161,97 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru - return cbuf.count; - } - -+static MediaBufsStatus -+qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt, -+ const unsigned int n, const bool x_dmabuf) -+{ -+ struct v4l2_buffer buf = { -+ .index = n, -+ .type = fmt->type, -+ }; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int ret; -+ -+ if (be->dh[0]) -+ return 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ memset(planes, 0, sizeof(planes)); -+ buf.m.planes = planes; -+ buf.length = VIDEO_MAX_PLANES; -+ } -+ -+ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) { -+ request_err(mbc->dc, "VIDIOC_QUERYBUF failed"); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) -+ { -+ unsigned int i; -+ for (i = 0; i != buf.length; ++i) { -+ if (x_dmabuf) { -+ struct v4l2_exportbuffer xbuf = { -+ .type = buf.type, -+ .index = buf.index, -+ .plane = i, -+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine -+ }; -+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) -+ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length); -+ } -+ else { -+ be->dh[i] = dmabuf_import_mmap( -+ mmap(NULL, planes[i].length, -+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ mbc->vfd, planes[i].m.mem_offset), -+ planes[i].length); -+ } -+ /* On failure tidy up and die 
*/ -+ if (!be->dh[i]) { -+ while (i--) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ } -+ else -+ { -+ if (x_dmabuf) { -+ struct v4l2_exportbuffer xbuf = { -+ .type = buf.type, -+ .index = buf.index, -+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine -+ }; -+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) -+ be->dh[0] = dmabuf_import(xbuf.fd, buf.length); -+ } -+ else { -+ be->dh[0] = dmabuf_import_mmap( -+ mmap(NULL, buf.length, -+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ mbc->vfd, buf.m.offset), -+ buf.length); -+ } -+ /* On failure tidy up and die */ -+ if (!be->dh[0]) { -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ -+ return 0; -+} -+ - struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) - { - struct qent_dst * be_dst; - - if (mbc == NULL) { -- be_dst = qe_dst_new(NULL); -+ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF); - if (be_dst) - be_dst->base.status = QENT_IMPORT; - return be_dst; -@@ -1144,7 +1265,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc - else { - be_dst = base_to_dst(queue_tryget_free(mbc->dst)); - if (!be_dst) { -- be_dst = qe_dst_new(mbc->this_wlm); -+ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype); - if (!be_dst) - return NULL; - -@@ -1155,12 +1276,21 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc - } - } - -- if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { -- /* Given how create buf works we can't uncreate it on alloc failure -- * all we can do is put it on the free Q -- */ -- queue_put_free(mbc->dst, &be_dst->base); -- return NULL; -+ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) { -+ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) { -+ request_err(mbc->dc, "Failed to export as dmabuf\n"); -+ queue_put_free(mbc->dst, &be_dst->base); -+ return NULL; 
-+ } -+ } -+ else { -+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { -+ /* Given how create buf works we can't uncreate it on alloc failure -+ * all we can do is put it on the free Q -+ */ -+ queue_put_free(mbc->dst, &be_dst->base); -+ return NULL; -+ } - } - - be_dst->base.status = QENT_PENDING; -@@ -1208,7 +1338,7 @@ MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, - - // ** This is a mess if we get partial alloc but without any way to remove - // individual V4L2 Q members we are somewhat stuffed --MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype) - { - unsigned int i; - int a = 0; -@@ -1218,10 +1348,12 @@ MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, cons - if (n > 32) - return MEDIABUFS_ERROR_ALLOCATION_FAILED; - -+ mbc->dst->memtype = memtype; -+ - // Create qents first as it is hard to get rid of the V4L2 buffers on error - for (qc = 0; qc != n; ++qc) - { -- if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) -+ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL) - goto fail; - } - -@@ -1260,19 +1392,61 @@ void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src * - queue_put_free(mbc->src, &qe_src->base); - } - -+static MediaBufsStatus -+chk_memory_type(struct mediabufs_ctl *const mbc, -+ const struct v4l2_format * const f, -+ const enum mediabufs_memory m) -+{ -+ struct v4l2_create_buffers cbuf = { -+ .count = 0, -+ .memory = V4L2_MEMORY_MMAP, -+ .format = *f -+ }; -+ -+ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ switch (m) { -+ case MEDIABUFS_MEMORY_DMABUF: -+ // 0 = Unknown but assume not in that case -+ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0) -+ return 
MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; -+ break; -+ case MEDIABUFS_MEMORY_MMAP: -+ break; -+ default: -+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; -+ } -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus -+mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) -+{ -+ return chk_memory_type(mbc, &mbc->src_fmt, memtype); -+} -+ -+MediaBufsStatus -+mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) -+{ -+ return chk_memory_type(mbc, &mbc->dst_fmt, memtype); -+} -+ - /* src format must have been set up before this */ - MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, - struct dmabufs_ctl * const dbsc, -- unsigned int n) -+ unsigned int n, const enum mediabufs_memory memtype) - { - unsigned int i; - struct v4l2_requestbuffers req = { - .count = n, - .type = mbc->src_fmt.type, -- .memory = V4L2_MEMORY_DMABUF -+ .memory = mediabufs_memory_to_v4l2(memtype) - }; - - bq_free_all_free_src(mbc->src); -+ - while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { - if (errno != EINTR) { - request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); -@@ -1286,21 +1460,36 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, - } - - for (i = 0; i != n; ++i) { -- struct qent_src *const be_src = qe_src_new(); -+ struct qent_src *const be_src = qe_src_new(memtype); - if (!be_src) { - request_err(mbc->dc, "Failed to create src be %d\n", i); - goto fail; - } -- if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { -- qe_src_free(be_src); -+ switch (memtype) { -+ case MEDIABUFS_MEMORY_MMAP: -+ if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) { -+ qe_src_free(be_src); -+ goto fail; -+ } -+ be_src->fixed_size = 1; -+ break; -+ case MEDIABUFS_MEMORY_DMABUF: -+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { -+ qe_src_free(be_src); -+ goto fail; -+ } -+ be_src->fixed_size = !mediabufs_src_resizable(mbc); 
-+ break; -+ default: -+ request_err(mbc->dc, "Unexpected memorty type\n"); - goto fail; - } - be_src->base.index = i; -- be_src->fixed_size = !mediabufs_src_resizable(mbc); - - queue_put_free(mbc->src, &be_src->base); - } - -+ mbc->src->memtype = memtype; - return MEDIABUFS_STATUS_SUCCESS; - - fail: -@@ -1437,9 +1626,13 @@ int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ - - int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) - { -+#if 1 -+ return 0; -+#else - // Single planar OUTPUT can only take exact size buffers - // Multiplanar will take larger than negotiated - return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); -+#endif - } - - static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) -diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h -index 0307a831de..890947b2e2 100644 ---- a/libavcodec/v4l2_req_media.h -+++ b/libavcodec/v4l2_req_media.h -@@ -43,6 +43,7 @@ typedef enum media_buf_status { - MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, - MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, - MEDIABUFS_ERROR_ALLOCATION_FAILED, -+ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY, - } MediaBufsStatus; - - struct media_pool * media_pool_new(const char * const media_path, -@@ -70,6 +71,15 @@ struct qent_dst; - struct dmabuf_h; - struct dmabufs_ctl; - -+// 1-1 mammping to V4L2 type - just defined separetely to avoid some include versioning difficulties -+enum mediabufs_memory { -+ MEDIABUFS_MEMORY_UNSET = 0, -+ MEDIABUFS_MEMORY_MMAP = 1, -+ MEDIABUFS_MEMORY_USERPTR = 2, -+ MEDIABUFS_MEMORY_OVERLAY = 3, -+ MEDIABUFS_MEMORY_DMABUF = 4, -+}; -+ - int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); - struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); - -@@ -93,6 +103,8 @@ MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, - unsigned int plane, - int fd, size_t size); - -+const char * mediabufs_memory_name(const enum mediabufs_memory m); -+ - 
MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, - struct media_request **const pmreq, - struct qent_src **const psrc_be, -@@ -106,7 +118,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, - // Create dst slots without alloc - // If fixed true then qent_alloc will only get slots from this pool and will - // block until a qent has been unrefed --MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype); - - MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); - MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); -@@ -140,7 +152,12 @@ MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, - - MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, - struct dmabufs_ctl * const dbsc, -- unsigned int n); -+ unsigned int n, -+ const enum mediabufs_memory memtype); -+ -+// Want to have appropriate formats set first -+MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); -+MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); - - #define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) - unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index cd79aad563..5cf17dd5e3 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -144,6 +144,8 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - const struct decdev * decdev; - const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes - size_t src_size; 
-+ enum mediabufs_memory src_memtype; -+ enum mediabufs_memory dst_memtype; - - av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); - -@@ -174,8 +176,14 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - decdev_media_path(decdev), decdev_video_path(decdev)); - - if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { -- av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); -- goto fail0; -+ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n"); -+ src_memtype = MEDIABUFS_MEMORY_MMAP; -+ dst_memtype = MEDIABUFS_MEMORY_MMAP; -+ } -+ else { -+ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n"); -+ src_memtype = MEDIABUFS_MEMORY_DMABUF; -+ dst_memtype = MEDIABUFS_MEMORY_DMABUF; - } - - if ((ctx->pq = pollqueue_new()) == NULL) { -@@ -196,8 +204,9 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - // Ask for an initial bitbuf size of max size / 4 - // We will realloc if we need more - // Must use sps->h/w as avctx contains cropped size -+retry_src_memtype: - src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); -- if (mediabufs_src_resizable(ctx->mbufs)) -+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs)) - src_size /= 4; - // Kludge for conformance tests which break Annex A limits - else if (src_size < 0x40000) -@@ -210,6 +219,15 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - goto fail4; - } - -+ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) { -+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) { -+ src_memtype = MEDIABUFS_MEMORY_MMAP; -+ goto retry_src_memtype; -+ } -+ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n"); -+ goto fail4; -+ } -+ - if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 4); -@@ -238,7 +256,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - goto fail4; - } - -- if 
(mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { -+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) { - av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); - goto fail4; - } -@@ -250,8 +268,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, - avctx->thread_count, avctx->extra_hw_frames); - -+ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) { -+ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n"); -+ goto fail4; -+ } -+ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n"); -+ dst_memtype = MEDIABUFS_MEMORY_MMAP; -+ } -+ - // extra_hw_frames is -1 if unset -- if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { -+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) { - av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); - goto fail4; - } -@@ -277,9 +304,10 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - // Set our s/w format - avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; - -- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", -+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", - ctx->fns->name, -- decdev_media_path(decdev), decdev_video_path(decdev)); -+ decdev_media_path(decdev), decdev_video_path(decdev), -+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); - - return 0; - - -From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 22 Aug 2022 12:35:40 +0000 -Subject: [PATCH 064/136] Set buffer lengths on DQ - ---- - libavcodec/v4l2_req_media.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c -index 910ac77bb6..1a9944774a 100644 ---- 
a/libavcodec/v4l2_req_media.c -+++ b/libavcodec/v4l2_req_media.c -@@ -733,6 +733,14 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, - return NULL; - } - -+ if (mp) { -+ unsigned int i; -+ for (i = 0; i != buffer.length; ++i) -+ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0); -+ } -+ else -+ dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0); -+ - be->timestamp = buffer.timestamp; - be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; - return be; - -From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 22 Aug 2022 17:11:24 +0000 -Subject: [PATCH 065/136] Fix compile if videodev2.h defines V4L2 HEVC request - API - -If videodev2.h does define the HEVC request API it is really hard to -set old variations of the controls so if it does then we only compile -against the system includes and remove the back compatability. ---- - configure | 9 +++++++++ - libavcodec/Makefile | 4 ++-- - libavcodec/hevc-ctrls-v4.h | 2 ++ - libavcodec/v4l2_req_hevc_vx.c | 5 ----- - libavcodec/v4l2_request_hevc.c | 6 ++++-- - 5 files changed, 17 insertions(+), 9 deletions(-) - -diff --git a/configure b/configure -index fdc95146bf..5c00a183e3 100755 ---- a/configure -+++ b/configure -@@ -1946,6 +1946,7 @@ FEATURE_LIST=" - swscale_alpha - vout_drm - vout_egl -+ v4l2_req_hevc_vx - " - - # this list should be kept in linking order -@@ -6912,6 +6913,14 @@ fi - - check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns - check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" -+disable v4l2_req_hevc_vx -+if enabled hevc_v4l2request_hwaccel; then -+ enable v4l2_req_hevc_vx -+fi -+if enabled hevc_v4l2_request; then -+ disable v4l2_req_hevc_vx -+fi -+ - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete 
- -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index d433a71236..11f183c9b9 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -999,8 +999,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o --OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o -+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o -+OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h -index 7e05f6e7c3..7829d82084 100644 ---- a/libavcodec/hevc-ctrls-v4.h -+++ b/libavcodec/hevc-ctrls-v4.h -@@ -53,6 +53,8 @@ - #include - #include - -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ - #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) - #define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) - #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -index 5d083016f8..e1bd5c6a1f 100644 ---- a/libavcodec/v4l2_req_hevc_vx.c -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -40,11 +40,6 @@ - #define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B - #endif - --// Should be in videodev2 but we might not have a good enough one --#ifndef V4L2_PIX_FMT_HEVC_SLICE --#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ --#endif -- - 
#include "v4l2_request_hevc.h" - - #include "libavutil/hwcontext_drm.h" -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 5cf17dd5e3..614a1b4d99 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -17,7 +17,7 @@ - */ - - -- -+#include "config.h" - #include "decode.h" - #include "hevcdec.h" - #include "hwconfig.h" -@@ -142,7 +142,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) - const HEVCSPS * const sps = h->ps.sps; - int ret; - const struct decdev * decdev; -- const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes -+ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes - size_t src_size; - enum mediabufs_memory src_memtype; - enum mediabufs_memory dst_memtype; -@@ -232,6 +232,7 @@ retry_src_memtype: - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 4); - } -+#if CONFIG_V4L2_REQ_HEVC_VX - else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 3); -@@ -244,6 +245,7 @@ retry_src_memtype: - av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); - ctx->fns = &V2(ff_v4l2_req_hevc, 1); - } -+#endif - else { - av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); - ret = AVERROR(EINVAL); - -From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Sep 2022 17:59:22 +0100 -Subject: [PATCH 066/136] v4l2_m2m_enc: Send headers in in pkt side_data - -If GLOBAL_HEADERS are requested then we can't provide them at init time -so send as NEW_EXTRADATA side data in a similar way to some AV1 -encoders. 
---- - libavcodec/v4l2_m2m_enc.c | 33 +++++++++++++++++++++++---------- - 1 file changed, 23 insertions(+), 10 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index 05ff6ba726..099ad23928 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -544,14 +544,12 @@ dequeue: - av_freep(&avctx->extradata); - avctx->extradata_size = 0; - -- if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) -- memcpy(data, avpkt->data, len); -+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) -+ goto fail_no_mem; - -+ memcpy(data, avpkt->data, len); - av_packet_unref(avpkt); - -- if (data == NULL) -- return AVERROR(ENOMEM); -- - // We need to copy the header, but keep local if not global - if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { - avctx->extradata = data; -@@ -567,18 +565,28 @@ dequeue: - } - - // First frame must be key so mark as such even if encoder forgot -- if (capture->first_buf == 2) -+ if (capture->first_buf == 2) { - avpkt->flags |= AV_PKT_FLAG_KEY; - -+ // Add any extradata to the 1st packet we emit as we cannot create it at init -+ if (avctx->extradata_size > 0 && avctx->extradata) { -+ void * const side = av_packet_new_side_data(avpkt, -+ AV_PKT_DATA_NEW_EXTRADATA, -+ avctx->extradata_size); -+ if (!side) -+ goto fail_no_mem; -+ -+ memcpy(side, avctx->extradata, avctx->extradata_size); -+ } -+ } -+ - // Add SPS/PPS to the start of every key frame if non-global headers - if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { - const size_t newlen = s->extdata_size + avpkt->size; - AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); - -- if (buf == NULL) { -- av_packet_unref(avpkt); -- return AVERROR(ENOMEM); -- } -+ if (buf == NULL) -+ goto fail_no_mem; - - memcpy(buf->data, s->extdata_data, s->extdata_size); - memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); -@@ -592,6 +600,11 @@ dequeue: - // av_log(avctx, 
AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); - capture->first_buf = 0; - return 0; -+ -+fail_no_mem: -+ ret = AVERROR(ENOMEM); -+ av_packet_unref(avpkt); -+ return ret; - } - - static av_cold int v4l2_encode_init(AVCodecContext *avctx) - -From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 14 Sep 2022 15:44:10 +0000 -Subject: [PATCH 067/136] matroskaenc: Allow H264 SPS/PPS headers in packet - sidedata - ---- - libavformat/matroskaenc.c | 26 ++++++++++++++++++++++---- - 1 file changed, 22 insertions(+), 4 deletions(-) - -diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c -index 113541bd9a..61e4c976ef 100644 ---- a/libavformat/matroskaenc.c -+++ b/libavformat/matroskaenc.c -@@ -77,6 +77,10 @@ - - #define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? \ - ((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER) -+ -+/* Reserved size for H264 headers if not extant at init time */ -+#define MAX_H264_HEADER_SIZE 1024 -+ - #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \ - !(mkv)->is_live) - -@@ -1121,8 +1125,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn - case AV_CODEC_ID_WAVPACK: - return put_wv_codecpriv(dyn_cp, extradata, extradata_size); - case AV_CODEC_ID_H264: -- return ff_isom_write_avcc(dyn_cp, extradata, -- extradata_size); -+ if (par->extradata_size) -+ return ff_isom_write_avcc(dyn_cp, extradata, -+ extradata_size); -+ else -+ *size_to_reserve = MAX_H264_HEADER_SIZE; -+ break; - case AV_CODEC_ID_HEVC: - return ff_isom_write_hvcc(dyn_cp, extradata, - extradata_size, 0); -@@ -2731,8 +2739,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) - } - break; - #endif -- // FIXME: Remove the following once libaom starts propagating proper extradata during init() -- // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208 -+ // FIXME: Remove the 
following once libaom starts propagating extradata during init() -+ // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012 - case AV_CODEC_ID_AV1: - if (side_data_size && mkv->track.bc && !par->extradata_size) { - // If the reserved space doesn't suffice, only write -@@ -2744,6 +2752,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) - } else if (!par->extradata_size) - return AVERROR_INVALIDDATA; - break; -+ // H264 V4L2 has a similar issue -+ case AV_CODEC_ID_H264: -+ if (side_data_size && mkv->track.bc && !par->extradata_size) { -+ ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size, -+ par, mkv->track.bc, track, 0); -+ if (ret < 0) -+ return ret; -+ } else if (!par->extradata_size) -+ return AVERROR_INVALIDDATA; -+ break; - default: - if (side_data_size) - av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for stream %d.\n", pkt->stream_index); - -From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 14 Sep 2022 15:55:15 +0000 -Subject: [PATCH 068/136] movenc: Allow H264 SPS/PPS headers in packet sidedata - ---- - libavformat/movenc.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/libavformat/movenc.c b/libavformat/movenc.c -index c4fcb5f8b1..891adbf7b2 100644 ---- a/libavformat/movenc.c -+++ b/libavformat/movenc.c -@@ -6343,6 +6343,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt) - if (trk->par->codec_id == AV_CODEC_ID_MP4ALS || - trk->par->codec_id == AV_CODEC_ID_AAC || - trk->par->codec_id == AV_CODEC_ID_AV1 || -+ trk->par->codec_id == AV_CODEC_ID_H264 || - trk->par->codec_id == AV_CODEC_ID_FLAC) { - size_t side_size; - uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); - -From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 26 Sep 2022 12:45:05 +0100 -Subject: [PATCH 069/136] Allow ffmpeg to select codec internal hwfmts if - no_cvt_hw - 
-This allows the selection of DRM_PRIME from v4l2m2m without forcing it -in the decoder. - -Not utterly sure this is the right method for 5.1 but it does work ---- - fftools/ffmpeg.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index ba0c1898cf..839da7b472 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -2763,12 +2763,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat - break; - - if (ist->hwaccel_id == HWACCEL_GENERIC || -- ist->hwaccel_id == HWACCEL_AUTO) { -+ ist->hwaccel_id == HWACCEL_AUTO || -+ no_cvt_hw) { - for (i = 0;; i++) { - config = avcodec_get_hw_config(s->codec, i); - if (!config) - break; -- if (!(config->methods & -+ if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL)) -+ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p); -+ else if (!(config->methods & - AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX)) - continue; - if (config->pix_fmt == *p) - -From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 1 Sep 2022 11:42:41 +0000 -Subject: [PATCH 070/136] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler - -The logic for running an isp based scaler is pretty much identical to -that for the deinterlacer so add to the deinterlacer. This requires -some rework of the setup code to avoid assumptions that are true for -deinterlace but not scale but the reworked code requires few switches -based on operation. 
---- - libavfilter/allfilters.c | 1 + - libavfilter/vf_deinterlace_v4l2m2m.c | 1123 ++++++++++++++++++++------ - 2 files changed, 877 insertions(+), 247 deletions(-) - -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 357ff61ca8..d504fa1bc8 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -421,6 +421,7 @@ extern const AVFilter ff_vf_scale; - extern const AVFilter ff_vf_scale_cuda; - extern const AVFilter ff_vf_scale_npp; - extern const AVFilter ff_vf_scale_qsv; -+extern const AVFilter ff_vf_scale_v4l2m2m; - extern const AVFilter ff_vf_scale_vaapi; - extern const AVFilter ff_vf_scale_vulkan; - extern const AVFilter ff_vf_scale2ref; -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 1a3bef5bcb..2df39ec0f1 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -52,31 +52,36 @@ - #include "avfilter.h" - #include "formats.h" - #include "internal.h" -+#include "scale_eval.h" - #include "video.h" - -+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ -+#endif -+ - typedef struct V4L2Queue V4L2Queue; - typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; - --typedef struct V4L2PlaneInfo { -- int bytesperline; -- size_t length; --} V4L2PlaneInfo; -+typedef enum filter_type_v4l2_e -+{ -+ FILTER_V4L2_DEINTERLACE = 1, -+ FILTER_V4L2_SCALE, -+} filter_type_v4l2_t; - - typedef struct V4L2Buffer { - int enqueued; - int reenqueue; -- int fd; - struct v4l2_buffer buffer; - AVFrame frame; - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - int num_planes; -- V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; - AVDRMFrameDescriptor drm_frame; - V4L2Queue *q; - } V4L2Buffer; - - typedef struct V4L2Queue { - struct v4l2_format format; -+ struct v4l2_selection sel; - int num_buffers; - V4L2Buffer *buffers; - DeintV4L2M2MContextShared *ctx; -@@ -111,11 
+116,18 @@ typedef struct pts_track_s - - typedef struct DeintV4L2M2MContextShared { - void * logctx; // For logging - will be NULL when done -+ filter_type_v4l2_t filter_type; - - int fd; - int done; - int width; - int height; -+ -+ // from options -+ int output_width; -+ int output_height; -+ enum AVPixelFormat output_format; -+ - int orig_width; - int orig_height; - atomic_uint refcount; -@@ -134,8 +146,60 @@ typedef struct DeintV4L2M2MContext { - const AVClass *class; - - DeintV4L2M2MContextShared *shared; -+ -+ char * w_expr; -+ char * h_expr; -+ char * output_format_string;; -+ -+ int force_original_aspect_ratio; -+ int force_divisible_by; -+ -+ char *colour_primaries_string; -+ char *colour_transfer_string; -+ char *colour_matrix_string; -+ int colour_range; -+ char *chroma_location_string; -+ -+ enum AVColorPrimaries colour_primaries; -+ enum AVColorTransferCharacteristic colour_transfer; -+ enum AVColorSpace colour_matrix; -+ enum AVChromaLocation chroma_location; - } DeintV4L2M2MContext; - -+// These just list the ones we know we can cope with -+static uint32_t -+fmt_av_to_v4l2(const enum AVPixelFormat avfmt) -+{ -+ switch (avfmt) { -+ case AV_PIX_FMT_YUV420P: -+ return V4L2_PIX_FMT_YUV420; -+ case AV_PIX_FMT_NV12: -+ return V4L2_PIX_FMT_NV12; -+ case AV_PIX_FMT_RPI4_8: -+ case AV_PIX_FMT_SAND128: -+ return V4L2_PIX_FMT_NV12_COL128; -+ default: -+ break; -+ } -+ return 0; -+} -+ -+static enum AVPixelFormat -+fmt_v4l2_to_av(const uint32_t pixfmt) -+{ -+ switch (pixfmt) { -+ case V4L2_PIX_FMT_YUV420: -+ return AV_PIX_FMT_YUV420P; -+ case V4L2_PIX_FMT_NV12: -+ return AV_PIX_FMT_NV12; -+ case V4L2_PIX_FMT_NV12_COL128: -+ return AV_PIX_FMT_RPI4_8; -+ default: -+ break; -+ } -+ return AV_PIX_FMT_NONE; -+} -+ - static unsigned int pts_stats_interval(const pts_stats_t * const stats) - { - return stats->last_interval; -@@ -301,6 +365,39 @@ static int pts_track_init(pts_track_t * const trk, void *logctx) - return 0; - } - -+static inline uint32_t -+fmt_bpl(const 
struct v4l2_format * const fmt, const unsigned int plane_n) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline; -+} -+ -+static inline uint32_t -+fmt_height(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -+} -+ -+static inline uint32_t -+fmt_width(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -+} -+ -+static inline uint32_t -+fmt_pixelformat(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; -+} -+ -+static void -+init_format(V4L2Queue * const q, const uint32_t format_type) -+{ -+ memset(&q->format, 0, sizeof(q->format)); -+ memset(&q->sel, 0, sizeof(q->sel)); -+ q->format.type = format_type; -+ q->sel.type = format_type; -+} -+ - static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) - { - struct v4l2_capability cap; -@@ -311,80 +408,99 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) - if (ret < 0) - return ret; - -- if (!(cap.capabilities & V4L2_CAP_STREAMING)) -+ if (ctx->filter_type == FILTER_V4L2_SCALE && -+ strcmp("bcm2835-codec-isp", cap.card) != 0) -+ { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n"); - return AVERROR(EINVAL); -+ } - -- if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -- -- return 0; -+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n"); -+ return AVERROR(EINVAL); - } - - if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { -- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -- -- return 0; -+ 
init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE); -+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE); -+ } -+ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); -+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT); -+ } -+ else { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n"); -+ return AVERROR(EINVAL); - } - -- return AVERROR(EINVAL); -+ return 0; - } - --static int deint_v4l2m2m_try_format(V4L2Queue *queue) -+// Just use for probe - doesn't modify q format -+static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt) - { -- struct v4l2_format *fmt = &queue->format; -+ struct v4l2_format fmt = {.type = queue->format.type}; - DeintV4L2M2MContextShared *ctx = queue->ctx; - int ret, field; -+ // Pick YUV to test with if not otherwise specified -+ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt); -+ enum AVPixelFormat r_avfmt; -+ - -- ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); -+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt); - if (ret) - av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); - -- if (V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type)) - field = V4L2_FIELD_INTERLACED_TB; - else - field = V4L2_FIELD_NONE; - -- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; -- fmt->fmt.pix_mp.field = field; -- fmt->fmt.pix_mp.width = ctx->width; -- fmt->fmt.pix_mp.height = ctx->height; -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { -+ fmt.fmt.pix_mp.pixelformat = pixelformat; -+ fmt.fmt.pix_mp.field = field; -+ fmt.fmt.pix_mp.width = width; -+ fmt.fmt.pix_mp.height = height; - } else { -- fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; -- fmt->fmt.pix.field = field; -- fmt->fmt.pix.width = ctx->width; -- fmt->fmt.pix.height = ctx->height; -+ 
fmt.fmt.pix.pixelformat = pixelformat; -+ fmt.fmt.pix.field = field; -+ fmt.fmt.pix.width = width; -+ fmt.fmt.pix.height = height; - } - -- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, -- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, -- fmt->fmt.pix_mp.pixelformat, -- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); -+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, -+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, -+ fmt.fmt.pix_mp.pixelformat, -+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); - -- ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); -+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt); - if (ret) - return AVERROR(EINVAL); - -- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, -- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, -- fmt->fmt.pix_mp.pixelformat, -- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); -+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, -+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, -+ fmt.fmt.pix_mp.pixelformat, -+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); - -- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && -- fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || -- fmt->fmt.pix_mp.field != field) { -- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt)); -+ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), 
V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); -+ return AVERROR(EINVAL); -+ } -+ if (r_avfmt == AV_PIX_FMT_NONE) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); -+ return AVERROR(EINVAL); -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { -+ if (fmt.fmt.pix_mp.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); - - return AVERROR(EINVAL); - } - } else { -- if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && -- fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || -- fmt->fmt.pix.field != field) { -- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ if (fmt.fmt.pix.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); - - return AVERROR(EINVAL); - } -@@ -393,68 +509,410 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - return 0; - } - --static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) -+static int -+do_s_fmt(V4L2Queue * const q) - { -- struct v4l2_format *fmt = &queue->format; -- DeintV4L2M2MContextShared *ctx = queue->ctx; -+ DeintV4L2M2MContextShared * const ctx = q->ctx; -+ const uint32_t pixelformat = fmt_pixelformat(&q->format); - int ret; - -- struct v4l2_selection sel = { -- .type = fmt->type, -- .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, -- }; -- -- // This works for most single object 4:2:0 types -- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- fmt->fmt.pix_mp.pixelformat = pixelformat; -- fmt->fmt.pix_mp.field = field; -- fmt->fmt.pix_mp.width = width; -- fmt->fmt.pix_mp.height = ysize / pitch; -- fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; -- fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); -- } else { -- fmt->fmt.pix.pixelformat = pixelformat; -- fmt->fmt.pix.field = field; -- fmt->fmt.pix.width = width; -- fmt->fmt.pix.height = height; -- fmt->fmt.pix.sizeimage = 0; -- fmt->fmt.pix.bytesperline = 0; -- } -- -- ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); -+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format); - if (ret) { - ret = AVERROR(errno); -- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret)); - return ret; - } - -- if (pixelformat != fmt->fmt.pix.pixelformat) { -- av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); -+ if (pixelformat != fmt_pixelformat(&q->format)) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format))); - return AVERROR(EINVAL); - } - -- ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); -+ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, -+ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? 
V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE; -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel); - if (ret) { - ret = AVERROR(errno); -- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); -+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret)); - } - -- sel.r.width = width; -- sel.r.height = height; -- sel.r.left = 0; -- sel.r.top = 0; -- sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, -- sel.flags = V4L2_SEL_FLAG_LE; -+ return 0; -+} - -- ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); -- if (ret) { -- ret = AVERROR(errno); -- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); -+static void -+set_fmt_color(struct v4l2_format *const fmt, -+ const enum AVColorPrimaries avcp, -+ const enum AVColorSpace avcs, -+ const enum AVColorTransferCharacteristic avxc) -+{ -+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; -+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; -+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; -+ -+ switch (avcp) { -+ case AVCOL_PRI_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ ycbcr = V4L2_YCBCR_ENC_709; -+ break; -+ case AVCOL_PRI_BT470M: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ ycbcr = V4L2_YCBCR_ENC_601; -+ break; -+ case AVCOL_PRI_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_PRI_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_PRI_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_PRI_BT2020: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ case AVCOL_PRI_SMPTE428: -+ case AVCOL_PRI_SMPTE431: -+ case AVCOL_PRI_SMPTE432: -+ case AVCOL_PRI_EBU3213: -+ case AVCOL_PRI_RESERVED: -+ case AVCOL_PRI_FILM: -+ case AVCOL_PRI_UNSPECIFIED: -+ default: -+ break; -+ } -+ -+ switch (avcs) { -+ case AVCOL_SPC_RGB: -+ cs = V4L2_COLORSPACE_SRGB; -+ break; -+ case AVCOL_SPC_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ break; -+ case AVCOL_SPC_FCC: -+ cs = 
V4L2_COLORSPACE_470_SYSTEM_M; -+ break; -+ case AVCOL_SPC_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_SPC_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_SPC_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_SPC_BT2020_CL: -+ cs = V4L2_COLORSPACE_BT2020; -+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; -+ break; -+ case AVCOL_SPC_BT2020_NCL: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ default: -+ break; -+ } -+ -+ switch (xfer) { -+ case AVCOL_TRC_BT709: -+ xfer = V4L2_XFER_FUNC_709; -+ break; -+ case AVCOL_TRC_IEC61966_2_1: -+ xfer = V4L2_XFER_FUNC_SRGB; -+ break; -+ case AVCOL_TRC_SMPTE240M: -+ xfer = V4L2_XFER_FUNC_SMPTE240M; -+ break; -+ case AVCOL_TRC_SMPTE2084: -+ xfer = V4L2_XFER_FUNC_SMPTE2084; -+ break; -+ default: -+ break; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.colorspace = cs; -+ fmt->fmt.pix_mp.ycbcr_enc = ycbcr; -+ fmt->fmt.pix_mp.xfer_func = xfer; -+ } else { -+ fmt->fmt.pix.colorspace = cs; -+ fmt->fmt.pix.ycbcr_enc = ycbcr; -+ fmt->fmt.pix.xfer_func = xfer; -+ } -+} -+ -+static void -+set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr) -+{ -+ const enum v4l2_quantization q = -+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : -+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : -+ V4L2_QUANTIZATION_DEFAULT; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.quantization = q; -+ } else { -+ fmt->fmt.pix.quantization = q; -+ } -+} -+ -+static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
-+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ switch(ycbcr) { -+ case V4L2_YCBCR_ENC_XV709: -+ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709; -+ case V4L2_YCBCR_ENC_XV601: -+ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M; -+ default: -+ break; -+ } -+ -+ switch(cs) { -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M; -+ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020; -+ default: -+ break; -+ } -+ -+ return AVCOL_PRI_UNSPECIFIED; -+} -+ -+static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ switch(cs) { -+ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; -+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; -+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; -+ case V4L2_COLORSPACE_BT2020: -+ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) -+ return AVCOL_SPC_BT2020_CL; -+ else -+ return AVCOL_SPC_BT2020_NCL; -+ default: -+ break; -+ } -+ -+ return AVCOL_SPC_UNSPECIFIED; -+} -+ -+static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_xfer_func xfer; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
-+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.xfer_func: -+ fmt->fmt.pix.xfer_func; -+ -+ switch (xfer) { -+ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709; -+ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1; -+ default: -+ break; -+ } -+ -+ switch (cs) { -+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22; -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M; -+ default: -+ break; -+ } -+ -+ switch (ycbcr) { -+ case V4L2_YCBCR_ENC_XV709: -+ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG; -+ default: -+ break; -+ } -+ -+ return AVCOL_TRC_UNSPECIFIED; -+} -+ -+static enum AVColorRange get_color_range(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_quantization qt; -+ -+ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.quantization : -+ fmt->fmt.pix.quantization; -+ -+ switch (qt) { -+ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; -+ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; -+ default: -+ break; -+ } -+ -+ return AVCOL_RANGE_UNSPECIFIED; -+} -+ -+static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) -+{ -+ struct v4l2_format *const format = &q->format; -+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; -+ -+ const uint32_t drm_fmt = src->layers[0].format; -+ // Treat INVALID as LINEAR -+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
-+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; -+ uint32_t pix_fmt = 0; -+ uint32_t w = 0; -+ uint32_t h = 0; -+ uint32_t bpl = src->layers[0].planes[0].pitch; -+ -+ // We really don't expect multiple layers -+ // All formats that we currently cope with are single object -+ -+ if (src->nb_layers != 1 || src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ switch (drm_fmt) { -+ case DRM_FORMAT_YUV420: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 3) -+ break; -+ pix_fmt = V4L2_PIX_FMT_YUV420; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ break; -+ -+ case DRM_FORMAT_NV12: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_COL128; -+ w = bpl; -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ case DRM_FORMAT_P030: -+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; -+ w = bpl / 2; // Matching lie to how we construct this -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ default: -+ break; -+ } -+ -+ if (!pix_fmt) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->plane_fmt[0].bytesperline = bpl; -+ pix->num_planes = 1; -+ } -+ else { -+ struct v4l2_pix_format *const pix = &format->fmt.pix; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->bytesperline = bpl; - } - -+ set_fmt_color(format, 
frame->color_primaries, frame->colorspace, frame->color_trc); -+ set_fmt_color_range(format, frame->color_range); -+ -+ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right); -+ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom); -+ q->sel.r.left = frame->crop_left; -+ q->sel.r.top = frame->crop_top; -+ - return 0; - } - -+ -+static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height) -+{ -+ struct v4l2_format * const fmt = &queue->format; -+ struct v4l2_selection *const sel = &queue->sel; -+ -+ memset(&fmt->fmt, 0, sizeof(fmt->fmt)); -+ -+ // Align w/h to 16 here in case there are alignment requirements at the next -+ // stage of the filter chain (also RPi deinterlace setup is bust and this -+ // fixes it) -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = pixelformat; -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = FFALIGN(width, 16); -+ fmt->fmt.pix_mp.height = FFALIGN(height, 16); -+ } else { -+ fmt->fmt.pix.pixelformat = pixelformat; -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = FFALIGN(width, 16); -+ fmt->fmt.pix.height = FFALIGN(height, 16); -+ } -+ -+ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer); -+ set_fmt_color_range(fmt, priv->colour_range); -+ -+ sel->r.width = width; -+ sel->r.height = height; -+ sel->r.left = 0; -+ sel->r.top = 0; -+ -+ return do_s_fmt(queue); -+} -+ - static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) - { - int ret; -@@ -464,16 +922,22 @@ static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node - return AVERROR(errno); - - ret = deint_v4l2m2m_prepare_context(ctx); -- if (ret) -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n"); - goto fail; -+ } - -- ret = deint_v4l2m2m_try_format(&ctx->capture); -- if (ret) -+ ret = 
deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format); -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n"); - goto fail; -+ } - -- ret = deint_v4l2m2m_try_format(&ctx->output); -- if (ret) -+ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE); -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n"); - goto fail; -+ } - - return 0; - -@@ -534,26 +998,118 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) - return 0; - } - --static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) -+static void -+drm_frame_init(AVDRMFrameDescriptor * const d) -+{ -+ unsigned int i; -+ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) { -+ d->objects[i].fd = -1; -+ } -+} -+ -+static void -+drm_frame_uninit(AVDRMFrameDescriptor * const d) -+{ -+ unsigned int i; -+ for (i = 0; i != d->nb_objects; ++i) { -+ if (d->objects[i].fd != -1) { -+ close(d->objects[i].fd); -+ d->objects[i].fd = -1; -+ } -+ } -+} -+ -+static void -+avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n) -+{ -+ unsigned int i; -+ V4L2Buffer* const avbufs = *ppavbufs; -+ -+ if (avbufs == NULL) -+ return; -+ *ppavbufs = NULL; -+ -+ for (i = 0; i != n; ++i) { -+ V4L2Buffer* const avbuf = avbufs + i; -+ drm_frame_uninit(&avbuf->drm_frame); -+ } -+ -+ av_free(avbufs); -+} -+ -+static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) - { - struct v4l2_exportbuffer expbuf; - int i, ret; - uint64_t mod = DRM_FORMAT_MOD_LINEAR; -- uint32_t fmt = 0; - -- switch (pixelformat) { -- case V4L2_PIX_FMT_NV12: -- fmt = DRM_FORMAT_NV12; -- break; -- case V4L2_PIX_FMT_YUV420: -- fmt = DRM_FORMAT_YUV420; -- break; -- default: -- return AVERROR(EINVAL); -+ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; -+ const struct v4l2_format *const fmt = &q->format; -+ const uint32_t height = 
fmt_height(fmt); -+ const uint32_t width = fmt_width(fmt); -+ ptrdiff_t bpl0; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_layers = 1; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = fmt_bpl(fmt, i); - } -+ bpl0 = layer->planes[0].pitch; -+ -+ switch (fmt_pixelformat(fmt)) { -+ -+ case V4L2_PIX_FMT_NV12_COL128: -+ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); -+ layer->format = V4L2_PIX_FMT_NV12; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = width; -+ layer->planes[1].pitch = width; -+ break; - -- avbuf->drm_frame.layers[0].format = fmt; -+ case DRM_FORMAT_NV12: -+ layer->format = V4L2_PIX_FMT_NV12; - -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = bpl0 * height; -+ layer->planes[1].pitch = bpl0; -+ break; -+ -+ case V4L2_PIX_FMT_YUV420: -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = bpl0 * height; -+ layer->planes[1].pitch = bpl0 / 2; -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4); -+ layer->planes[2].pitch = bpl0 / 2; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ return AVERROR(EINVAL); -+ } -+ -+ drm_desc->nb_objects = 0; - for (i = 0; i < avbuf->num_planes; i++) { - memset(&expbuf, 0, sizeof(expbuf)); - -@@ -565,19 +1121,11 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) - if (ret < 0) - return AVERROR(errno); - -- avbuf->fd = expbuf.fd; -- -- if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { -- /* drm frame */ -- 
avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; -- avbuf->drm_frame.objects[i].fd = expbuf.fd; -- avbuf->drm_frame.objects[i].format_modifier = mod; -- } else { -- /* drm frame */ -- avbuf->drm_frame.objects[0].size = avbuf->buffer.length; -- avbuf->drm_frame.objects[0].fd = expbuf.fd; -- avbuf->drm_frame.objects[0].format_modifier = mod; -- } -+ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ? -+ avbuf->buffer.m.planes[i].length : avbuf->buffer.length; -+ drm_desc->objects[i].fd = expbuf.fd; -+ drm_desc->objects[i].format_modifier = mod; -+ drm_desc->nb_objects = i + 1; - } - - return 0; -@@ -588,7 +1136,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - struct v4l2_format *fmt = &queue->format; - DeintV4L2M2MContextShared *ctx = queue->ctx; - struct v4l2_requestbuffers req; -- int ret, i, j, multiplanar; -+ int ret, i, multiplanar; - uint32_t memory; - - memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? -@@ -617,10 +1165,9 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - } - - for (i = 0; i < queue->num_buffers; i++) { -- V4L2Buffer *buf = &queue->buffers[i]; -+ V4L2Buffer * const buf = &queue->buffers[i]; - - buf->enqueued = 0; -- buf->fd = -1; - buf->q = queue; - - buf->buffer.type = fmt->type; -@@ -632,6 +1179,12 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - buf->buffer.m.planes = buf->planes; - } - -+ drm_frame_init(&buf->drm_frame); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer * const buf = &queue->buffers[i]; -+ - ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); - if (ret < 0) { - ret = AVERROR(errno); -@@ -639,29 +1192,14 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - goto fail; - } - -- if (multiplanar) -- buf->num_planes = buf->buffer.length; -- else -- buf->num_planes = 1; -- -- for (j = 0; j < buf->num_planes; j++) { -- V4L2PlaneInfo *info = &buf->plane_info[j]; -- -- if (multiplanar) { -- info->bytesperline = 
fmt->fmt.pix_mp.plane_fmt[j].bytesperline; -- info->length = buf->buffer.m.planes[j].length; -- } else { -- info->bytesperline = fmt->fmt.pix.bytesperline; -- info->length = buf->buffer.length; -- } -- } -+ buf->num_planes = multiplanar ? buf->buffer.length : 1; - - if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { - ret = deint_v4l2m2m_enqueue_buffer(buf); - if (ret) - goto fail; - -- ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); -+ ret = v4l2_buffer_export_drm(queue, buf); - if (ret) - goto fail; - } -@@ -670,12 +1208,8 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - return 0; - - fail: -- for (i = 0; i < queue->num_buffers; i++) -- if (queue->buffers[i].fd >= 0) -- close(queue->buffers[i].fd); -- av_free(queue->buffers); -- queue->buffers = NULL; -- -+ avbufs_delete(&queue->buffers, queue->num_buffers); -+ queue->num_buffers = 0; - return ret; - } - -@@ -862,7 +1396,6 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) - if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { - V4L2Queue *capture = &ctx->capture; - V4L2Queue *output = &ctx->output; -- int i; - - av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); - -@@ -871,12 +1404,7 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) - deint_v4l2m2m_streamoff(output); - } - -- if (capture->buffers) -- for (i = 0; i < capture->num_buffers; i++) { -- capture->buffers[i].q = NULL; -- if (capture->buffers[i].fd >= 0) -- close(capture->buffers[i].fd); -- } -+ avbufs_delete(&capture->buffers, capture->num_buffers); - - deint_v4l2m2m_unref_queued(output); - -@@ -908,73 +1436,15 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) - deint_v4l2m2m_destroy_context(ctx); - } - --static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) --{ -- AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -- AVDRMLayerDescriptor *layer; -- -- /* fill the DRM frame descriptor */ -- 
drm_desc->nb_objects = avbuf->num_planes; -- drm_desc->nb_layers = 1; -- -- layer = &drm_desc->layers[0]; -- layer->nb_planes = avbuf->num_planes; -- -- for (int i = 0; i < avbuf->num_planes; i++) { -- layer->planes[i].object_index = i; -- layer->planes[i].offset = 0; -- layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -- } -- -- switch (layer->format) { -- case DRM_FORMAT_YUYV: -- layer->nb_planes = 1; -- break; -- -- case DRM_FORMAT_NV12: -- case DRM_FORMAT_NV21: -- if (avbuf->num_planes > 1) -- break; -- -- layer->nb_planes = 2; -- -- layer->planes[1].object_index = 0; -- layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -- height; -- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -- break; -- -- case DRM_FORMAT_YUV420: -- if (avbuf->num_planes > 1) -- break; -- -- layer->nb_planes = 3; -- -- layer->planes[1].object_index = 0; -- layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -- height; -- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -- -- layer->planes[2].object_index = 0; -- layer->planes[2].offset = layer->planes[1].offset + -- ((avbuf->plane_info[0].bytesperline * -- height) >> 2); -- layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -- break; -- -- default: -- drm_desc->nb_layers = 0; -- break; -- } -- -- return (uint8_t *) drm_desc; --} -- - // timeout in ms - static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) - { - DeintV4L2M2MContextShared *ctx = queue->ctx; - V4L2Buffer* avbuf; -+ enum AVColorPrimaries color_primaries; -+ enum AVColorSpace colorspace; -+ enum AVColorTransferCharacteristic color_trc; -+ enum AVColorRange color_range; - - av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); - -@@ -985,8 +1455,6 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - } - - // Fill in PTS and anciliary info from src frame -- // we will want to overwrite some fields as only the pts/dts -- // fields are 
updated with new timing in this fn - pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); - - frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, -@@ -999,18 +1467,36 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - - atomic_fetch_add(&ctx->refcount, 1); - -- frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); -+ frame->data[0] = (uint8_t *)&avbuf->drm_frame; - frame->format = AV_PIX_FMT_DRM_PRIME; - if (ctx->hw_frames_ctx) - frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); -- frame->height = ctx->height; -- frame->width = ctx->width; -- -- // Not interlaced now -- frame->interlaced_frame = 0; -- frame->top_field_first = 0; -- // Pkt duration halved -- frame->pkt_duration /= 2; -+ frame->height = ctx->output_height; -+ frame->width = ctx->output_width; -+ -+ color_primaries = get_color_primaries(&ctx->capture.format); -+ colorspace = get_color_space(&ctx->capture.format); -+ color_trc = get_color_trc(&ctx->capture.format); -+ color_range = get_color_range(&ctx->capture.format); -+ -+ // If the color parameters are unspecified by V4L2 then leave alone as they -+ // will have been copied from src -+ if (color_primaries != AVCOL_PRI_UNSPECIFIED) -+ frame->color_primaries = color_primaries; -+ if (colorspace != AVCOL_SPC_UNSPECIFIED) -+ frame->colorspace = colorspace; -+ if (color_trc != AVCOL_TRC_UNSPECIFIED) -+ frame->color_trc = color_trc; -+ if (color_range != AVCOL_RANGE_UNSPECIFIED) -+ frame->color_range = color_range; -+ -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) { -+ // Not interlaced now -+ frame->interlaced_frame = 0; // *** Fill in from dst buffer? 
-+ frame->top_field_first = 0; -+ // Pkt duration halved -+ frame->pkt_duration /= 2; -+ } - - if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { - av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); -@@ -1032,15 +1518,34 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) - ctx->height = avctx->inputs[0]->h; - ctx->width = avctx->inputs[0]->w; - -- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); -+ if (ctx->filter_type == FILTER_V4L2_SCALE) { -+ if ((ret = ff_scale_eval_dimensions(priv, -+ priv->w_expr, priv->h_expr, -+ inlink, outlink, -+ &ctx->output_width, &ctx->output_height)) < 0) -+ return ret; -+ -+ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height, -+ priv->force_original_aspect_ratio, priv->force_divisible_by); -+ } -+ else { -+ ctx->output_width = ctx->width; -+ ctx->output_height = ctx->height; -+ } -+ -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); - - outlink->time_base = inlink->time_base; -- outlink->w = inlink->w; -- outlink->h = inlink->h; -- outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; -+ outlink->w = ctx->output_width; -+ outlink->h = ctx->output_height; - outlink->format = inlink->format; - outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate - -+ if (inlink->sample_aspect_ratio.num) -+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); -+ else -+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; -+ - ret = deint_v4l2m2m_find_device(ctx); - if (ret) - return ret; -@@ -1055,18 +1560,19 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) - - static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) - { -- const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || -- drm_desc->objects[0].format_modifier == 
DRM_FORMAT_MOD_INVALID); -+ const uint64_t mod = drm_desc->objects[0].format_modifier; -+ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID); -+ -+ // Only currently support single object things -+ if (drm_desc->nb_objects != 1) -+ return 0; - - switch (drm_desc->layers[0].format) { - case DRM_FORMAT_YUV420: -- if (is_linear) -- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; -- break; -+ return is_linear ? V4L2_PIX_FMT_YUV420 : 0; - case DRM_FORMAT_NV12: -- if (is_linear) -- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0; -- break; -+ return is_linear ? V4L2_PIX_FMT_NV12 : -+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; - default: - break; - } -@@ -1089,7 +1595,7 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - - if (ctx->field_order == V4L2_FIELD_ANY) { - const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -- const uint32_t pixelformat = desc_pixelformat(drm_desc); -+ uint32_t pixelformat = desc_pixelformat(drm_desc); - - if (pixelformat == 0) { - av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", -@@ -1104,29 +1610,49 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, - drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); - -- ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -- if (ret) -+ if ((ret = set_src_fmt(output, in)) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n", -+ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier); -+ return ret; -+ } -+ -+ ret = do_s_fmt(output); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to set source 
format\n"); - return ret; -+ } - -- ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); -- if (ret) -+ if (ctx->output_format != AV_PIX_FMT_NONE) -+ pixelformat = fmt_av_to_v4l2(ctx->output_format); -+ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n"); - return ret; -+ } - - ret = deint_v4l2m2m_allocate_buffers(capture); -- if (ret) -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n"); - return ret; -+ } - - ret = deint_v4l2m2m_streamon(capture); -- if (ret) -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret)); - return ret; -+ } - - ret = deint_v4l2m2m_allocate_buffers(output); -- if (ret) -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n"); - return ret; -+ } - - ret = deint_v4l2m2m_streamon(output); -- if (ret) -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret)); - return ret; -+ } - - if (in->top_field_first) - ctx->field_order = V4L2_FIELD_INTERLACED_TB; -@@ -1251,7 +1777,7 @@ again: - return did_something ? 
0 : FFERROR_NOT_READY; - } - --static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) -+static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type) - { - DeintV4L2M2MContext * const priv = avctx->priv; - DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); -@@ -1262,6 +1788,7 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - } - priv->shared = ctx; - ctx->logctx = priv; -+ ctx->filter_type = filter_type; - ctx->fd = -1; - ctx->output.ctx = ctx; - ctx->output.num_buffers = 8; -@@ -1274,9 +1801,52 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - - atomic_init(&ctx->refcount, 1); - -+ if (priv->output_format_string) { -+ ctx->output_format = av_get_pix_fmt(priv->output_format_string); -+ if (ctx->output_format == AV_PIX_FMT_NONE) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string); -+ return AVERROR(EINVAL); -+ } -+ if (fmt_av_to_v4l2(ctx->output_format) == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format)); -+ return AVERROR(EINVAL); -+ } -+ } else { -+ // Use the input format once that is configured. 
-+ ctx->output_format = AV_PIX_FMT_NONE; -+ } -+ -+#define STRING_OPTION(var_name, func_name, default_value) do { \ -+ if (priv->var_name ## _string) { \ -+ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \ -+ if (var < 0) { \ -+ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \ -+ return AVERROR(EINVAL); \ -+ } \ -+ priv->var_name = var; \ -+ } else { \ -+ priv->var_name = default_value; \ -+ } \ ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++} test; ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd() % 0x201 - 0x100; \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ + } while (0) + -+ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED); -+ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED); -+ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED); -+ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED); ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) + - return 0; - } - -+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++static void check_add_put_clamped(void) +{ -+ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE); -+} -+ -+static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE); -+} -+ - static void deint_v4l2m2m_uninit(AVFilterContext *avctx) - { - DeintV4L2M2MContext *priv = avctx->priv; -@@ -1294,6 +1864,51 @@ static const AVOption deinterlace_v4l2m2m_options[] = { - - AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); - -+#define OFFSET(x) offsetof(DeintV4L2M2MContext, x) -+#define FLAGS 
(AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) -+ -+static const AVOption scale_v4l2m2m_options[] = { -+ { "w", "Output video width", -+ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS }, -+ { "h", "Output video height", -+ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS }, -+ { "format", "Output video format (software format of hardware frames)", -+ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS }, -+ // These colour properties match the ones of the same name in vf_scale. -+ { "out_color_matrix", "Output colour matrix coefficient set", -+ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS }, -+ { "out_range", "Output colour range", -+ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, -+ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" }, -+ { "full", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ { "limited", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "jpeg", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ { "mpeg", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "tv", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "pc", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ // These colour properties match the ones in the VAAPI scaler -+ { "out_color_primaries", "Output colour primaries", -+ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "out_color_transfer", "Output colour transfer characteristics", -+ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "out_chroma_location", "Output chroma sample location", -+ OFFSET(chroma_location_string), 
AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, -+ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS }, -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(scale_v4l2m2m); -+ - static const AVFilterPad deint_v4l2m2m_inputs[] = { - { - .name = "default", -@@ -1321,3 +1936,17 @@ AVFilter ff_vf_deinterlace_v4l2m2m = { - .priv_class = &deinterlace_v4l2m2m_class, - .activate = deint_v4l2m2m_activate, - }; -+ -+AVFilter ff_vf_scale_v4l2m2m = { -+ .name = "scale_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &scale_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ FILTER_INPUTS(deint_v4l2m2m_inputs), -+ FILTER_OUTPUTS(deint_v4l2m2m_outputs), -+ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), -+ .priv_class = &scale_v4l2m2m_class, -+ .activate = deint_v4l2m2m_activate, -+}; -+ - -From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 22 Sep 2022 14:54:46 +0000 -Subject: [PATCH 071/136] v4l2_m2m: Adjust buffer allocation based on min/max - controls - -Clip requested buffer count to min/max declared by driver. -If 0 buffers requested then set to min+2. -This allows encode to keep its src buffer count down to a plausible -minimum which helps with flow control. 
---- - libavcodec/v4l2_context.c | 19 +++++++++++++++++++ - 1 file changed, 19 insertions(+) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 6b97eab41e..ba36689ff3 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -1187,6 +1187,7 @@ fail_release: - - int ff_v4l2_context_init(V4L2Context* ctx) - { -+ struct v4l2_queryctrl qctrl; - V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - int ret; - -@@ -1228,6 +1229,24 @@ int ff_v4l2_context_init(V4L2Context* ctx) - goto fail_unref_hwframes; - } - -+ memset(&qctrl, 0, sizeof(qctrl)); -+ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT; -+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) { -+ ret = AVERROR(errno); -+ if (ret != AVERROR(EINVAL)) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret)); -+ goto fail_unref_hwframes; -+ } -+ // Control unsupported - set default if wanted -+ if (ctx->num_buffers < 2) -+ ctx->num_buffers = 4; -+ } -+ else { -+ if (ctx->num_buffers < 2) -+ ctx->num_buffers = qctrl.minimum + 2; -+ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum); -+ } -+ - ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); - if (ret < 0) - goto fail_unref_hwframes; - -From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 22 Sep 2022 15:00:12 +0000 -Subject: [PATCH 072/136] v4l2_m2m_dec: If src Q is full then wait indefinitely - for buffer - -If it is not possible to add another buffer to the src Q then alawys -wait indefinitely for either an output frame or the Q to have space. - -This has issues if the reason that the Q is stalled is due to dst buffer -exhaustion and buffers cannot be returned async by another thread but -the current scheme confuses ffmpegs pipeline scheduling. 
---- - libavcodec/v4l2_m2m_dec.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 485a96f4b4..bb183097f6 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -456,9 +456,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - if (dst_rv != 0 && TRY_DQ(src_rv)) { - // Pick a timeout depending on state - const int t = -+ src_rv == NQ_Q_FULL ? -1 : - src_rv == NQ_DRAINING ? 300 : -- prefer_dq ? 5 : -- src_rv == NQ_Q_FULL ? -1 : 0; -+ prefer_dq ? 5 : 0; - - // Dequeue frame will unref any previous contents of frame - // if it returns success so we don't need an explicit unref - -From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 22 Sep 2022 15:12:27 +0000 -Subject: [PATCH 073/136] vf_deinterlace_v4l2m2m: Add Q name to structure for - debug - ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 2df39ec0f1..4edecc02bf 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -84,6 +84,7 @@ typedef struct V4L2Queue { - struct v4l2_selection sel; - int num_buffers; - V4L2Buffer *buffers; -+ const char * name; - DeintV4L2M2MContextShared *ctx; - } V4L2Queue; - -@@ -1792,8 +1793,10 @@ static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filt - ctx->fd = -1; - ctx->output.ctx = ctx; - ctx->output.num_buffers = 8; -+ ctx->output.name = "OUTPUT"; - ctx->capture.ctx = ctx; - ctx->capture.num_buffers = 12; -+ ctx->capture.name = "CAPTURE"; - ctx->done = 0; - ctx->field_order = V4L2_FIELD_ANY; - - -From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 22 Sep 2022 16:08:42 +0000 -Subject: [PATCH 074/136] v4l2_m2m_enc: Set src buffer count to min+2 by - default - -Set 
output.num_buffers to 0 by default which will then be set to min+2 -by the allocation code. This fixes an issue where the deinterlacer had -fewer dest buffer than the encoder has src buffers and so ran dry -creating deadlock in the ffmpeg filter chain. ---- - libavcodec/v4l2_m2m_enc.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index 099ad23928..b8ba815c37 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -672,9 +672,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx) - #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM - - #define V4L_M2M_CAPTURE_OPTS \ -- V4L_M2M_DEFAULT_OPTS,\ -+ { "num_output_buffers", "Number of buffers in the output context",\ -+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\ - { "num_capture_buffers", "Number of buffers in the capture context", \ -- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS } -+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS } - - static const AVOption mpeg4_options[] = { - V4L_M2M_CAPTURE_OPTS, - -From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 22 Sep 2022 16:13:57 +0000 -Subject: [PATCH 075/136] vf_deinterlace_m2m: For deinterlace set outlink FR to - twice inlink - -We used to set the outlink framerate to unknown but it turns out that -ffmpegs filter pipeline copes with that badly. Otherwise leave at 0,0 -which will copy FR from inlink to outlink. 
---- - libavfilter/vf_deinterlace_v4l2m2m.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 4edecc02bf..c52dae1c44 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -1534,13 +1534,16 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) - ctx->output_height = ctx->height; - } - -- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__, -+ ctx->width, ctx->height, ctx->output_width, ctx->output_height, -+ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den); - - outlink->time_base = inlink->time_base; - outlink->w = ctx->output_width; - outlink->h = ctx->output_height; - outlink->format = inlink->format; -- outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0) -+ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den}; - - if (inlink->sample_aspect_ratio.num) - outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); - -From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 23 Sep 2022 11:30:56 +0000 -Subject: [PATCH 076/136] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from - a Q - -Useful for where (encode) we might have drmprime buffers that we want to -return to the source ASAP. 
---- - libavcodec/v4l2_context.c | 17 +++++++++++------ - libavcodec/v4l2_context.h | 2 ++ - 2 files changed, 13 insertions(+), 6 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ba36689ff3..4a359bf45e 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -707,17 +707,22 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) - return avbuf; - } - -+void -+ff_v4l2_dq_all(V4L2Context *const ctx) -+{ -+ V4L2Buffer * avbuf; -+ do { -+ get_qbuf(ctx, &avbuf, 0); -+ } while (avbuf); -+} -+ - static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - { - int i; - - /* get back as many output buffers as possible */ -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- V4L2Buffer * avbuf; -- do { -- get_qbuf(ctx, &avbuf, 0); -- } while (avbuf); -- } -+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ ff_v4l2_dq_all(ctx); - - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 21265f1bd7..523c53e97d 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -218,4 +218,6 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const - */ - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); - -+void ff_v4l2_dq_all(V4L2Context *const ctx); -+ - #endif // AVCODEC_V4L2_CONTEXT_H - -From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 23 Sep 2022 11:38:36 +0000 -Subject: [PATCH 077/136] v4l2_m2m_enc: DQ output more frequently - -Ensure that we DQ any released src buffers on every op to avoid deadlock -with source. - -There is a plausible argument that this patch is inelegant and the drain -should be integrated into dq_buf, but that is a further reaching delta. 
---- - libavcodec/v4l2_m2m_enc.c | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index b8ba815c37..a992a3cccc 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -421,6 +421,8 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const output = &s->output; - -+ ff_v4l2_dq_all(output); -+ - // Signal EOF if needed - if (!frame) { - return ff_v4l2_context_enqueue_frame(output, frame); -@@ -492,6 +494,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - AVFrame *frame = s->frame; - int ret; - -+ ff_v4l2_dq_all(output); -+ - if (s->draining) - goto dequeue; - -@@ -528,7 +532,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - } - - dequeue: -- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt); -+ ff_v4l2_dq_all(output); -+ if (ret) - return ret; - - if (capture->first_buf == 1) { -@@ -560,7 +566,9 @@ dequeue: - s->extdata_size = len; - } - -- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt); -+ ff_v4l2_dq_all(output); -+ if (ret) - return ret; - } - - -From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 26 Sep 2022 18:20:00 +0100 -Subject: [PATCH 078/136] conf_native: Remove --enable-rpi from all builds - ---- - pi-util/conf_native.sh | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -index 37cea71756..f22d531ca4 100755 ---- a/pi-util/conf_native.sh -+++ b/pi-util/conf_native.sh -@@ -54,9 +54,9 @@ if [ $MMAL ]; then - RPI_LIBDIRS="-L$RPI_OPT_VC/lib" - RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000" - RPI_EXTRALIBS="-Wl,--start-group 
-lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group" -- RPIOPTS="--enable-mmal --enable-rpi" -+ RPIOPTS="--enable-mmal" - else -- RPIOPTS="--disable-mmal --enable-sand" -+ RPIOPTS="--disable-mmal" - fi - - C=`lsb_release -sc` -@@ -89,6 +89,7 @@ $FFSRC/configure \ - $MCOPTS\ - --disable-stripping\ - --disable-thumb\ -+ --enable-sand\ - --enable-v4l2-request\ - --enable-libdrm\ - --enable-vout-egl\ - -From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 29 Sep 2022 19:48:08 +0000 -Subject: [PATCH 079/136] v4l2_m2m_dec: Deal correctly with avcC H264 data in - extradata - -Decoders expect AnnexB style headers, mkv and similar formats have -somewhat oddly wrapped extradata. Convert to annex-b style before use. ---- - libavcodec/v4l2_m2m.h | 2 +- - libavcodec/v4l2_m2m_dec.c | 177 ++++++++++++++++++++++++++++++++++++-- - 2 files changed, 169 insertions(+), 10 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index ee72beb052..babf101d65 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -118,7 +118,7 @@ typedef struct V4L2m2mContext { - /* Ext data sent */ - int extdata_sent; - /* Ext data sent in packet - overrides ctx */ -- uint8_t * extdata_data; -+ void * extdata_data; - size_t extdata_size; - - #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index bb183097f6..6bd9926b3f 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -46,6 +46,71 @@ - #define STATS_LAST_COUNT_MAX 64 - #define STATS_INTERVAL_MAX (1 << 30) - -+#ifndef FF_API_BUFFER_SIZE_T -+#define FF_API_BUFFER_SIZE_T 1 -+#endif -+ -+#define DUMP_FAILED_EXTRADATA 0 -+ -+#if DUMP_FAILED_EXTRADATA -+static inline char hex1(unsigned int x) -+{ -+ x &= 0xf; -+ return x <= 9 ? 
'0' + x : 'a' + x - 10; -+} -+ -+static inline char * hex2(char * s, unsigned int x) -+{ -+ *s++ = hex1(x >> 4); -+ *s++ = hex1(x); -+ return s; -+} -+ -+static inline char * hex4(char * s, unsigned int x) -+{ -+ s = hex2(s, x >> 8); -+ s = hex2(s, x); -+ return s; -+} -+ -+static inline char * dash2(char * s) -+{ -+ *s++ = '-'; -+ *s++ = '-'; -+ return s; -+} -+ -+static void -+data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len) -+{ -+ size_t i; -+ s = hex4(s, offset); -+ m += offset; -+ for (i = 0; i != 8; ++i) { -+ *s++ = ' '; -+ s = len > i + offset ? hex2(s, *m++) : dash2(s); -+ } -+ *s++ = ' '; -+ *s++ = ':'; -+ for (; i != 16; ++i) { -+ *s++ = ' '; -+ s = len > i + offset ? hex2(s, *m++) : dash2(s); -+ } -+ *s++ = 0; -+} -+ -+static void -+log_dump(void * logctx, int lvl, const void * const data, const size_t len) -+{ -+ size_t i; -+ for (i = 0; i < len; i += 16) { -+ char buf[80]; -+ data16(buf, i, data, len); -+ av_log(logctx, lvl, "%s\n", buf); -+ } -+} -+#endif -+ - static int64_t pts_stats_guess(const pts_stats_t * const stats) - { - if (stats->last_pts == AV_NOPTS_VALUE || -@@ -98,6 +163,98 @@ static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char - }; - } - -+// If abdata == NULL then this just counts space required -+// Unpacks avcC if detected -+static int -+h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata) -+{ -+ const uint8_t * const xdend = extradata + extrasize; -+ const uint8_t * p = extradata; -+ uint8_t * d = abdata; -+ unsigned int n; -+ unsigned int len; -+ const unsigned int hdrlen = 4; -+ unsigned int need_pps = 1; -+ -+ if (extrasize < 8) -+ return AVERROR(EINVAL); -+ -+ if (p[0] == 0 && p[1] == 0) { -+ // Assume a couple of leading zeros are good enough to indicate NAL -+ if (abdata) -+ memcpy(d, p, extrasize); -+ return extrasize; -+ } -+ -+ // avcC starts with a 1 -+ if (p[0] != 1) -+ return AVERROR(EINVAL); -+ -+ p += 5; -+ n = *p++ & 0x1f; -+ 
-+doxps: -+ while (n--) { -+ if (xdend - p < 2) -+ return AVERROR(EINVAL); -+ len = (p[0] << 8) | p[1]; -+ p += 2; -+ if (xdend - p < (ptrdiff_t)len) -+ return AVERROR(EINVAL); -+ if (abdata) { -+ d[0] = 0; -+ d[1] = 0; -+ d[2] = 0; -+ d[3] = 1; -+ memcpy(d + 4, p, len); -+ } -+ d += len + hdrlen; -+ p += len; -+ } -+ if (need_pps) { -+ need_pps = 0; -+ if (p >= xdend) -+ return AVERROR(EINVAL); -+ n = *p++; -+ goto doxps; -+ } -+ -+ return d - abdata; -+} -+ -+static int -+copy_extradata(AVCodecContext * const avctx, -+ const void * const src_data, const int src_len, -+ void ** const pdst_data, size_t * const pdst_len) -+{ -+ int len; -+ -+ *pdst_len = 0; -+ av_freep(pdst_data); -+ -+ if (avctx->codec_id == AV_CODEC_ID_H264) -+ len = h264_xd_copy(src_data, src_len, NULL); -+ else -+ len = src_len < 0 ? AVERROR(EINVAL) : src_len; -+ -+ // Zero length is OK but we swant to stop - -ve is error val -+ if (len <= 0) -+ return len; -+ -+ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) -+ return AVERROR(ENOMEM); -+ -+ if (avctx->codec_id == AV_CODEC_ID_H264) -+ h264_xd_copy(src_data, src_len, *pdst_data); -+ else -+ memcpy(*pdst_data, src_data, len); -+ *pdst_len = len; -+ -+ return 0; -+} -+ -+ -+ - static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) - { - int ret; -@@ -277,13 +434,8 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); - if (side_data) { - av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); -- av_freep(&s->extdata_data); -- if ((s->extdata_data = av_malloc(side_size ? 
side_size : 1)) == NULL) { -- av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); -- return AVERROR(ENOMEM); -- } -- memcpy(s->extdata_data, side_data, side_size); -- s->extdata_size = side_size; -+ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0) -+ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret)); - s->extdata_sent = 0; - } - -@@ -359,8 +511,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); - else if (s->extdata_data) - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); -- else -- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); - - if (ret == AVERROR(EAGAIN)) { - // Out of input buffers - keep packet -@@ -770,6 +920,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - return ret; - } - -+ if (avctx->extradata && -+ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret)); -+#if DUMP_FAILED_EXTRADATA -+ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size); -+#endif -+ return ret; -+ } -+ - if ((ret = v4l2_prepare_decoder(s)) < 0) - return ret; - - -From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 30 Sep 2022 14:20:23 +0000 -Subject: [PATCH 080/136] v4l2_request_hevc: Fix up - V4L2_CID_CODEC_STATELESS_BASE if missing - ---- - libavcodec/hevc-ctrls-v4.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h -index 7829d82084..c02fdbe5a8 100644 ---- a/libavcodec/hevc-ctrls-v4.h -+++ b/libavcodec/hevc-ctrls-v4.h -@@ -53,6 +53,13 @@ - #include - 
#include - -+#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS -+#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */ -+#endif -+#ifndef V4L2_CID_CODEC_STATELESS_BASE -+#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900) -+#endif -+ - #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ - - #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) - -From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Sat, 1 Oct 2022 13:40:57 +0000 -Subject: [PATCH 081/136] vf_deinterlace_v4l2m2m: Fix compile on m/c without - V4L2 SAND - ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 33 +++++++++++++++++++++++----- - 1 file changed, 28 insertions(+), 5 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index c52dae1c44..716789f988 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -35,6 +35,8 @@ - #include - #include - -+#include "config.h" -+ - #include "libavutil/avassert.h" - #include "libavutil/avstring.h" - #include "libavutil/common.h" -@@ -59,6 +61,16 @@ - #define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ - #endif - -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in drm_fourcc.h hopefully will be sometime in the future but until then... 
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ - typedef struct V4L2Queue V4L2Queue; - typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; - -@@ -176,9 +188,11 @@ fmt_av_to_v4l2(const enum AVPixelFormat avfmt) - return V4L2_PIX_FMT_YUV420; - case AV_PIX_FMT_NV12: - return V4L2_PIX_FMT_NV12; -+#if CONFIG_SAND - case AV_PIX_FMT_RPI4_8: - case AV_PIX_FMT_SAND128: - return V4L2_PIX_FMT_NV12_COL128; -+#endif - default: - break; - } -@@ -193,8 +207,10 @@ fmt_v4l2_to_av(const uint32_t pixfmt) - return AV_PIX_FMT_YUV420P; - case V4L2_PIX_FMT_NV12: - return AV_PIX_FMT_NV12; -+#if CONFIG_SAND - case V4L2_PIX_FMT_NV12_COL128: - return AV_PIX_FMT_RPI4_8; -+#endif - default: - break; - } -@@ -823,6 +839,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) - h = src->layers[0].planes[1].offset / bpl; - w = bpl; - } -+#if CONFIG_SAND - else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { - if (src->layers[0].nb_planes != 2) - break; -@@ -831,9 +848,11 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) - h = src->layers[0].planes[1].offset / 128; - bpl = fourcc_mod_broadcom_param(mod); - } -+#endif - break; - - case DRM_FORMAT_P030: -+#if CONFIG_SAND - if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { - if (src->layers[0].nb_planes != 2) - break; -@@ -842,6 +861,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) - h = src->layers[0].planes[1].offset / 128; - bpl = fourcc_mod_broadcom_param(mod); - } -+#endif - break; - - default: -@@ -1048,7 +1068,6 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) - AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; - const struct v4l2_format 
*const fmt = &q->format; - const uint32_t height = fmt_height(fmt); -- const uint32_t width = fmt_width(fmt); - ptrdiff_t bpl0; - - /* fill the DRM frame descriptor */ -@@ -1063,7 +1082,7 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) - bpl0 = layer->planes[0].pitch; - - switch (fmt_pixelformat(fmt)) { -- -+#if CONFIG_SAND - case V4L2_PIX_FMT_NV12_COL128: - mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); - layer->format = V4L2_PIX_FMT_NV12; -@@ -1074,9 +1093,10 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) - layer->nb_planes = 2; - layer->planes[1].object_index = 0; - layer->planes[1].offset = height * 128; -- layer->planes[0].pitch = width; -- layer->planes[1].pitch = width; -+ layer->planes[0].pitch = fmt_width(fmt); -+ layer->planes[1].pitch = layer->planes[0].pitch; - break; -+#endif - - case DRM_FORMAT_NV12: - layer->format = V4L2_PIX_FMT_NV12; -@@ -1576,7 +1596,10 @@ static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) - return is_linear ? V4L2_PIX_FMT_YUV420 : 0; - case DRM_FORMAT_NV12: - return is_linear ? V4L2_PIX_FMT_NV12 : -- fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; -+#if CONFIG_SAND -+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? 
V4L2_PIX_FMT_NV12_COL128 : -+#endif -+ 0; - default: - break; - } - -From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Sun, 2 Oct 2022 12:36:43 +0000 -Subject: [PATCH 082/136] configure: Fix v4l2_req_hevc_vx setup; set after deps - fixups - ---- - configure | 9 +++------ - 1 file changed, 3 insertions(+), 6 deletions(-) - -diff --git a/configure b/configure -index 5c00a183e3..94c8161b91 100755 ---- a/configure -+++ b/configure -@@ -6914,12 +6914,6 @@ fi - check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns - check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" - disable v4l2_req_hevc_vx --if enabled hevc_v4l2request_hwaccel; then -- enable v4l2_req_hevc_vx --fi --if enabled hevc_v4l2_request; then -- disable v4l2_req_hevc_vx --fi - - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete -@@ -7415,6 +7409,9 @@ check_deps $CONFIG_LIST \ - - enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86" - -+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done -+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx -+ - case $target_os in - haiku) - disable memalign - -From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Sat, 1 Oct 2022 12:39:45 +0000 -Subject: [PATCH 083/136] vf_deinterlace_v4l2m2m: Ensure we get consistent - final frames - -On getting EOS at the input of the filster do not simply drop everything -in transit on the floor but attempt to retrieve everything possible from -the capture Q before on-signalling EOS. 
-If we know that we expect 1 frame in to always produce 1 frame out then -match CAPTURE frame to the last OUTPUT frame Qed (scale) -If frames out have an unknown relation to source frames (deinterlace) try -an encode stop and wait for the last frame marker to emerge from CAPTURE ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 172 +++++++++++++++++++++++---- - 1 file changed, 148 insertions(+), 24 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 716789f988..ce875c2c61 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -94,6 +94,7 @@ typedef struct V4L2Buffer { - typedef struct V4L2Queue { - struct v4l2_format format; - struct v4l2_selection sel; -+ int eos; - int num_buffers; - V4L2Buffer *buffers; - const char * name; -@@ -127,20 +128,41 @@ typedef struct pts_track_s - pts_track_el_t a[PTS_TRACK_SIZE]; - } pts_track_t; - -+typedef enum drain_state_e -+{ -+ DRAIN_NONE = 0, // Not draining -+ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame -+ DRAIN_LAST, // Drain with long timeout last_frame in received on output expected -+ DRAIN_EOS, // Drain with long timeout EOS expected -+ DRAIN_DONE // Drained -+} drain_state_t; -+ - typedef struct DeintV4L2M2MContextShared { - void * logctx; // For logging - will be NULL when done - filter_type_v4l2_t filter_type; - - int fd; -- int done; -+ int done; // fd closed - awating all refs dropped - int width; - int height; - -+ int drain; // EOS received (inlink status) -+ drain_state_t drain_state; -+ int64_t drain_pts; // PTS associated with inline status -+ -+ unsigned int frames_rx; -+ unsigned int frames_tx; -+ - // from options - int output_width; - int output_height; - enum AVPixelFormat output_format; - -+ int has_enc_stop; -+ // We expect to get exactly the same number of frames out as we put in -+ // We can drain by matching input to output -+ int one_to_one; -+ - int orig_width; - int 
orig_height; - atomic_uint refcount; -@@ -179,6 +201,12 @@ typedef struct DeintV4L2M2MContext { - enum AVChromaLocation chroma_location; - } DeintV4L2M2MContext; - -+ -+static inline int drain_frame_expected(const drain_state_t d) -+{ -+ return d == DRAIN_EOS || d == DRAIN_LAST; -+} -+ - // These just list the ones we know we can cope with - static uint32_t - fmt_av_to_v4l2(const enum AVPixelFormat avfmt) -@@ -334,6 +362,13 @@ fail: - return 0; - } - -+// We are only ever expecting in-order frames so nothing more clever is required -+static unsigned int -+pts_track_count(const pts_track_t * const trk) -+{ -+ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1); -+} -+ - static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) - { - const uint32_t n = pts_track_next_n(trk); -@@ -406,6 +441,12 @@ fmt_pixelformat(const struct v4l2_format * const fmt) - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; - } - -+static inline uint32_t -+buf_bytesused0(const struct v4l2_buffer * const buf) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused; -+} -+ - static void - init_format(V4L2Queue * const q, const uint32_t format_type) - { -@@ -1469,12 +1510,24 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - - av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); - -+ if (queue->eos) { -+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__); -+ return AVERROR_EOF; -+ } -+ - avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); - if (!avbuf) { - av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); - return AVERROR(EAGAIN); - } - -+ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) { -+ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0) -+ queue->eos = 1; -+ if (buf_bytesused0(&avbuf->buffer) == 0) -+ return queue->eos ? 
AVERROR_EOF : AVERROR(EINVAL); -+ } -+ - // Fill in PTS and anciliary info from src frame - pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); - -@@ -1686,6 +1739,20 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - else - ctx->field_order = V4L2_FIELD_INTERLACED_BT; - -+ { -+ struct v4l2_encoder_cmd ecmd = { -+ .cmd = V4L2_ENC_CMD_STOP -+ }; -+ ctx->has_enc_stop = 0; -+ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n"); -+ ctx->has_enc_stop = 1; -+ } -+ else { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno))); -+ } -+ -+ } - } - - ret = deint_v4l2m2m_enqueue_frame(output, in); -@@ -1694,6 +1761,41 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - return ret; - } - -+static int -+ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s, -+ AVFilterLink * const inlink) -+{ -+ int instatus; -+ int64_t inpts; -+ -+ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0) -+ return 0; -+ -+ s->drain = instatus; -+ s->drain_pts = inpts; -+ s->drain_state = DRAIN_TIMEOUT; -+ -+ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started -+ s->drain_state = DRAIN_DONE; -+ } -+ else if (s->one_to_one) { -+ s->drain_state = DRAIN_LAST; -+ } -+ else if (s->has_enc_stop) { -+ struct v4l2_encoder_cmd ecmd = { -+ .cmd = V4L2_ENC_CMD_STOP -+ }; -+ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) { -+ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n"); -+ s->drain_state = DRAIN_EOS; -+ } -+ else { -+ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno))); -+ } -+ } -+ return 1; -+} -+ - static int deint_v4l2m2m_activate(AVFilterContext *avctx) - { - DeintV4L2M2MContext * const priv = avctx->priv; -@@ -1702,15 +1804,13 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) - AVFilterLink * const inlink = avctx->inputs[0]; 
- int n = 0; - int cn = 99; -- int instatus = 0; -- int64_t inpts = 0; - int did_something = 0; - - av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); - - FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); - -- ff_inlink_acknowledge_status(inlink, &instatus, &inpts); -+ ack_inlink(avctx, s, inlink); - - if (!ff_outlink_frame_wanted(outlink)) { - av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); -@@ -1720,7 +1820,6 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) - AVFrame * frame = av_frame_alloc(); - int rv; - --again: - recycle_q(&s->output); - n = count_enqueued(&s->output); - -@@ -1729,10 +1828,21 @@ again: - return AVERROR(ENOMEM); - } - -- rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0); -+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, -+ drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0); - if (rv != 0) { - av_frame_free(&frame); -- if (rv != AVERROR(EAGAIN)) { -+ if (rv == AVERROR_EOF) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } -+ else if (rv == AVERROR(EAGAIN)) { -+ if (s->drain_state != DRAIN_NONE) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } -+ } -+ else { - av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); - return rv; - } -@@ -1742,29 +1852,30 @@ again: - // frame is always consumed by filter_frame - even on error despite - // a somewhat confusing comment in the header - rv = ff_filter_frame(outlink, frame); -- -- if (instatus != 0) { -- av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); -- goto again; -- } -+ ++s->frames_tx; - - av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); - did_something = 1; -+ -+ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } - } - - cn = 
count_enqueued(&s->capture); - } - -- if (instatus != 0) { -- ff_outlink_set_status(outlink, instatus, inpts); -- av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); -+ if (s->drain_state == DRAIN_DONE) { -+ ff_outlink_set_status(outlink, s->drain, s->drain_pts); -+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain)); - return 0; - } - - recycle_q(&s->output); - n = count_enqueued(&s->output); - -- while (n < 6) { -+ while (n < 6 && !s->drain) { - AVFrame * frame; - int rv; - -@@ -1775,8 +1886,13 @@ again: - - if (frame == NULL) { - av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); -+ if (!ack_inlink(avctx, s, inlink)) { -+ ff_inlink_request_frame(inlink); -+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); -+ } - break; - } -+ ++s->frames_rx; - - rv = deint_v4l2m2m_filter_frame(inlink, frame); - av_frame_free(&frame); -@@ -1785,16 +1901,11 @@ again: - return rv; - - av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); -- ++n; -- } -- -- if (n < 6) { -- ff_inlink_request_frame(inlink); - did_something = 1; -- av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); -+ ++n; - } - -- if (n > 4 && ff_outlink_frame_wanted(outlink)) { -+ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) { - ff_filter_set_ready(avctx, 1); - did_something = 1; - av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); -@@ -1873,7 +1984,18 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - - static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) - { -- return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE); -+ int rv; -+ DeintV4L2M2MContext * priv; -+ DeintV4L2M2MContextShared * ctx; -+ -+ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0) -+ return rv; -+ -+ priv = avctx->priv; -+ ctx = priv->shared; -+ -+ ctx->one_to_one = 1; -+ return 0; - } - - static void deint_v4l2m2m_uninit(AVFilterContext *avctx) -@@ -1881,6 +2003,8 @@ static void 
deint_v4l2m2m_uninit(AVFilterContext *avctx) - DeintV4L2M2MContext *priv = avctx->priv; - DeintV4L2M2MContextShared *ctx = priv->shared; - -+ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", -+ ctx->frames_rx, ctx->frames_tx); - ctx->done = 1; - ctx->logctx = NULL; // Log to NULL works, log to missing crashes - pts_track_uninit(&ctx->track); - -From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 5 Oct 2022 16:12:02 +0000 -Subject: [PATCH 084/136] v4l2_m2m_dec: Rework decode pending heuristic - -The old code measured the length of the entire Q in the decoder and -attempted to dynamically guess an appropriate length. This was prone to -failure when the guesswork became confused. -The new code attempts to measure the Q length before insertion into decode -which, after all, is what we actually care about. It does this by -asserting that the decoder must have consumed all packets that came -before the one associated with the most recent CAPTURE frame. This -avoids all need for reorder buffer size guesswork. ---- - libavcodec/v4l2_m2m.h | 2 - - libavcodec/v4l2_m2m_dec.c | 77 +++++++++++++++++---------------------- - 2 files changed, 34 insertions(+), 45 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index babf101d65..26a7161042 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -107,8 +107,6 @@ typedef struct V4L2m2mContext { - - /* Frame tracking */ - xlat_track_t xlat; -- int pending_hw; -- int pending_n; - - pts_stats_t pts_stat; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 6bd9926b3f..bec9b22fcf 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -349,41 +349,54 @@ static void - xlat_flush(xlat_track_t * const x) - { - unsigned int i; -+ // Do not reset track_no - this ensures that any frames left in the decoder -+ // that turn up later get discarded. 
-+ -+ x->last_pts = AV_NOPTS_VALUE; -+ x->last_opaque = 0; - for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { - x->track_els[i].pending = 0; - x->track_els[i].discard = 1; - } -- x->last_pts = AV_NOPTS_VALUE; -+} -+ -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ xlat_flush(x); - } - - static int - xlat_pending(const xlat_track_t * const x) - { - unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; -- unsigned int i; -- int r = 0; -- int64_t now = AV_NOPTS_VALUE; -+ int i; -+ const int64_t now = x->last_pts; - -- for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { -+ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { - const V4L2m2mTrackEl * const t = x->track_els + n; - -+ // Discard only set on never-set or flushed entries -+ // So if we get here we've never successfully decoded a frame so allow -+ // more frames into the buffer before stalling -+ if (t->discard) -+ return i - 16; -+ -+ // If we've got this frame out then everything before this point -+ // must have entered the decoder - if (!t->pending) -- continue; -+ break; - -+ // If we've never seen a pts all we can do is count frames - if (now == AV_NOPTS_VALUE) -- now = t->dts; -+ continue; - -- if (t->pts == AV_NOPTS_VALUE || -- ((now == AV_NOPTS_VALUE || t->pts <= now) && -- (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) -- ++r; -+ if (t->dts != AV_NOPTS_VALUE && now >= t->dts) -+ break; - } - -- // If we never get any ideas about PTS vs DTS allow a lot more buffer -- if (now == AV_NOPTS_VALUE) -- r -= 16; -- -- return r; -+ return i; - } - - static inline int stream_started(const V4L2m2mContext * const s) { -@@ -557,18 +570,6 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) - return rv; - } - --// Number of frames over what xlat_pending returns that we keep *16 --// This is a min value - if it appears to be too small the threshold should --// adjust dynamically. 
--#define PENDING_HW_MIN (3 * 16) --// Offset to use when setting dynamically --// Set to %16 == 15 to avoid the threshold changing immediately as we relax --#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) --// Number of consecutive times we've failed to get a frame when we prefer it --// before we increase the prefer threshold (5ms * N = max expected decode --// time) --#define PENDING_N_THRESHOLD 6 -- - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -@@ -578,9 +579,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - - do { - const int pending = xlat_pending(&s->xlat); -- const int prefer_dq = (pending > s->pending_hw / 16); -+ const int prefer_dq = (pending > 3); - const int last_src_rv = src_rv; - -+ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); -+ - // Enqueue another pkt for decode if - // (a) We don't have a lot of stuff in the buffer already OR - // (b) ... 
we (think we) do but we've failed to get a frame already OR -@@ -625,20 +628,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - } - -- // Adjust dynamic pending threshold -- if (dst_rv == 0) { -- if (--s->pending_hw < PENDING_HW_MIN) -- s->pending_hw = PENDING_HW_MIN; -- s->pending_n = 0; -- -+ if (dst_rv == 0) - set_best_effort_pts(avctx, &s->pts_stat, frame); -- } -- else if (dst_rv == AVERROR(EAGAIN)) { -- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { -- s->pending_hw = pending * 16 + PENDING_HW_OFFSET; -- s->pending_n = 0; -- } -- } - - if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { - av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -@@ -857,8 +848,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if (ret < 0) - return ret; - -+ xlat_init(&s->xlat); - pts_stats_init(&s->pts_stat, avctx, "decoder"); -- s->pending_hw = PENDING_HW_MIN; - - capture = &s->capture; - output = &s->output; - -From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 21 Oct 2022 13:48:07 +0000 -Subject: [PATCH 085/136] pthread_frame: Fix MT hwaccel. Recent change broke - it. - -Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the -hwaccel is marked MT_SAFE. 
---- - libavcodec/pthread_frame.c | 48 ++++++++++++++++++++++++++++---------- - 1 file changed, 36 insertions(+), 12 deletions(-) - -diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c -index 2cc89a41f5..b14f8e9360 100644 ---- a/libavcodec/pthread_frame.c -+++ b/libavcodec/pthread_frame.c -@@ -231,7 +231,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg) - p->hwaccel_serializing = 0; - pthread_mutex_unlock(&p->parent->hwaccel_mutex); - } -- av_assert0(!avctx->hwaccel); -+ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); - - if (p->async_serializing) { - p->async_serializing = 0; -@@ -319,6 +319,12 @@ FF_ENABLE_DEPRECATION_WARNINGS - } - - dst->hwaccel_flags = src->hwaccel_flags; -+ if (src->hwaccel && -+ (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { -+ dst->hwaccel = src->hwaccel; -+ dst->hwaccel_context = src->hwaccel_context; -+ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data; -+ } - - err = av_buffer_replace(&dst->internal->pool, src->internal->pool); - if (err < 0) -@@ -434,10 +440,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, - } - - /* transfer the stashed hwaccel state, if any */ -- av_assert0(!p->avctx->hwaccel); -- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); -- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); -- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); -+ av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); -+ if (p->avctx->hwaccel && -+ !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { -+ FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); -+ FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); -+ FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); -+ } - - av_packet_unref(p->avpkt); - ret = av_packet_ref(p->avpkt, avpkt); -@@ -610,9 
+619,12 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { - * this is done here so that this worker thread can wipe its own hwaccel - * state after decoding, without requiring synchronization */ - av_assert0(!p->parent->stash_hwaccel); -- p->parent->stash_hwaccel = avctx->hwaccel; -- p->parent->stash_hwaccel_context = avctx->hwaccel_context; -- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { -+ p->parent->stash_hwaccel = avctx->hwaccel; -+ p->parent->stash_hwaccel_context = avctx->hwaccel_context; -+ p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; -+ } - - pthread_mutex_lock(&p->progress_mutex); - if(atomic_load(&p->state) == STATE_SETUP_FINISHED){ -@@ -667,6 +679,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) - - park_frame_worker_threads(fctx, thread_count); - -+ if (fctx->prev_thread && -+ avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && -+ avctx->internal->hwaccel_priv_data != -+ fctx->prev_thread->avctx->internal->hwaccel_priv_data) { -+ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n"); -+ } -+ } -+ - for (i = 0; i < thread_count; i++) { - PerThreadContext *p = &fctx->threads[i]; - AVCodecContext *ctx = p->avctx; -@@ -710,10 +731,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) - - /* if we have stashed hwaccel state, move it to the user-facing context, - * so it will be freed in avcodec_close() */ -- av_assert0(!avctx->hwaccel); -- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); -- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); -- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); -+ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); -+ if (avctx->hwaccel && -+ 
!(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { -+ FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); -+ FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); -+ FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); -+ } - - av_freep(&avctx->internal->thread_ctx); - } - -From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 13:18:27 +0000 -Subject: [PATCH 086/136] v4l2_req: Add swfmt to init logging - -(cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf) ---- - libavcodec/v4l2_request_hevc.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 614a1b4d99..767ecb036a 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -26,6 +26,7 @@ - #include "v4l2_request_hevc.h" - - #include "libavutil/hwcontext_drm.h" -+#include "libavutil/pixdesc.h" - - #include "v4l2_req_devscan.h" - #include "v4l2_req_dmabufs.h" -@@ -306,10 +307,11 @@ retry_src_memtype: - // Set our s/w format - avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; - -- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", -+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n", - ctx->fns->name, - decdev_media_path(decdev), decdev_video_path(decdev), -- mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); -+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype), -+ av_get_pix_fmt_name(avctx->sw_pix_fmt)); - - return 0; - - -From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 13:39:54 +0000 -Subject: [PATCH 087/136] v4l2_m2m: Avoid polling on a queue that is streamoff - -(cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b) ---- - 
libavcodec/v4l2_context.c | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 4a359bf45e..b296dc111c 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -578,6 +578,11 @@ get_event(V4L2m2mContext * const m) - return 0; - } - -+static inline int -+dq_ok(const V4L2Context * const c) -+{ -+ return c->streamon && atomic_load(&c->q_count) != 0; -+} - - // Get a buffer - // If output then just gets the buffer in the expected way -@@ -613,13 +618,13 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout - } - - // If capture && timeout == -1 then also wait for rx buffer free -- if (is_cap && timeout == -1 && m->output.streamon && !m->draining) -+ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining) - pfd.events |= poll_out; - - // If nothing Qed all we will get is POLLERR - avoid that -- if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || -- (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || -- (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { -+ if ((pfd.events == poll_out && !dq_ok(&m->output)) || -+ (pfd.events == poll_cap && !dq_ok(&m->capture)) || -+ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) { - av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); - return AVERROR(ENOSPC); - } - -From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 14:07:04 +0000 -Subject: [PATCH 088/136] v4l2_m2m: Add function to get number of queued - buffers - -(cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4) ---- - libavcodec/v4l2_context.h | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 523c53e97d..8e4f681643 100644 ---- 
a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -220,4 +220,15 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); - - void ff_v4l2_dq_all(V4L2Context *const ctx); - -+/** -+ * Returns the number of buffers currently queued -+ * -+ * @param[in] ctx The V4L2Context to evaluate -+ */ -+static inline int -+ff_v4l2_context_q_count(const V4L2Context* const ctx) -+{ -+ return atomic_load(&ctx->q_count); -+} -+ - #endif // AVCODEC_V4L2_CONTEXT_H - -From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 14:48:20 +0000 -Subject: [PATCH 089/136] v4l2_m2m: Add timeouts to dq_all and dequeue_packet - -Add timeouts and use them to have better flow control in encode - -(cherry picked from commit c6173cad7f21697e12887982bda796de9719bb32) ---- - libavcodec/v4l2_context.c | 16 +++++++++++----- - libavcodec/v4l2_context.h | 15 +++++++++++++-- - libavcodec/v4l2_m2m_enc.c | 28 +++++++++++++++++++--------- - 3 files changed, 43 insertions(+), 16 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index b296dc111c..7031f3d340 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -712,13 +712,19 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) - return avbuf; - } - --void --ff_v4l2_dq_all(V4L2Context *const ctx) -+int -+ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1) - { - V4L2Buffer * avbuf; -+ if (timeout1 != 0) { -+ int rv = get_qbuf(ctx, &avbuf, timeout1); -+ if (rv != 0) -+ return rv; -+ } - do { - get_qbuf(ctx, &avbuf, 0); - } while (avbuf); -+ return 0; - } - - static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) -@@ -727,7 +733,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - - /* get back as many output buffers as possible */ - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- ff_v4l2_dq_all(ctx); -+ ff_v4l2_dq_all(ctx, 0); - - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer * const avbuf = (V4L2Buffer 
*)ctx->bufrefs[i]->data; -@@ -1047,7 +1053,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - return 0; - } - --int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) - { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); - AVCodecContext *const avctx = s->avctx; -@@ -1055,7 +1061,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) - int rv; - - do { -- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) - return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC - if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) - return rv; -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 8e4f681643..5afed3e6ec 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -179,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); - * @param[inout] pkt The AVPacket to dequeue to. - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. - */ --int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); -+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout); - - /** - * Dequeues a buffer from a V4L2Context to an AVFrame. -@@ -218,7 +218,18 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const - */ - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); - --void ff_v4l2_dq_all(V4L2Context *const ctx); -+/** -+ * Dequeue all buffers on this queue -+ * -+ * Used to recycle output buffers -+ * -+ * @param[in] ctx The V4L2Context to dequeue from. -+ * @param[in] timeout1 A timeout on dequeuing the 1st buffer, -+ * all others have a timeout of zero -+ * @return AVERROR(EAGAIN) if timeout1 non-zero then the return -+ * of the first dequeue operation, 0 otherwise. 
-+ */ -+int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1); - - /** - * Returns the number of buffers currently queued -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index a992a3cccc..d0d27e5bc2 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -420,16 +420,24 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const output = &s->output; -+ int rv; -+ int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; - -- ff_v4l2_dq_all(output); -+ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); - -- // Signal EOF if needed -+ // Signal EOF if needed (doesn't need q slot) - if (!frame) { - return ff_v4l2_context_enqueue_frame(output, frame); - } - -+ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) { -+ // We should be able to return AVERROR(EAGAIN) to indicate buffer -+ // exhaustion, but ffmpeg currently treats that as fatal. -+ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv)); -+ return rv; -+ } -+ - if (s->input_drm && !output->streamon) { -- int rv; - struct v4l2_format req_format = {.type = output->format.type}; - - // Set format when we first get a buffer -@@ -494,7 +502,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - AVFrame *frame = s->frame; - int ret; - -- ff_v4l2_dq_all(output); -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ ff_v4l2_dq_all(output, 0); - - if (s->draining) - goto dequeue; -@@ -532,10 +542,10 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - } - - dequeue: -- ret = ff_v4l2_context_dequeue_packet(capture, avpkt); -- ff_v4l2_dq_all(output); -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); -+ ff_v4l2_dq_all(output, 0); - if (ret) -- return ret; -+ return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; - - if (capture->first_buf == 1) { - uint8_t * data; -@@ -566,8 +576,8 @@ dequeue: - s->extdata_size = len; - } - -- ret = ff_v4l2_context_dequeue_packet(capture, avpkt); -- ff_v4l2_dq_all(output); -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0); -+ ff_v4l2_dq_all(output, 0); - if (ret) - return ret; - } - -From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 14:23:32 +0000 -Subject: [PATCH 090/136] v4l2_m2m_enc: Improve debug trace - -(cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5) ---- - libavcodec/v4l2_m2m_enc.c | 13 ++++++++++--- - 1 file changed, 10 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index d0d27e5bc2..c8c2de3d47 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -427,6 +427,7 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - - // Signal EOF if needed (doesn't need q slot) - if (!frame) { -+ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__); - return ff_v4l2_context_enqueue_frame(output, frame); - } - -@@ -491,7 +492,12 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); - #endif - -- return ff_v4l2_context_enqueue_frame(output, frame); -+ rv = ff_v4l2_context_enqueue_frame(output, frame); -+ if (rv) { -+ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv)); -+ } -+ -+ return rv; - } - - static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) -@@ -502,7 +508,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - AVFrame *frame = s->frame; - int ret; - -- av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__, -+ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture)); - - ff_v4l2_dq_all(output, 0); - 
-@@ -615,11 +622,11 @@ dequeue: - avpkt->size = newlen; - } - --// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); - capture->first_buf = 0; - return 0; - - fail_no_mem: -+ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n"); - ret = AVERROR(ENOMEM); - av_packet_unref(avpkt); - return ret; - -From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 18 Oct 2022 13:22:36 +0000 -Subject: [PATCH 091/136] v4l2_m2m_enc: Copy dest packets to memory if short of - v4l2 buffers - -(cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5) ---- - libavcodec/v4l2_m2m_enc.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index c8c2de3d47..c23187e6e6 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -621,6 +621,22 @@ dequeue: - avpkt->data = buf->data; - avpkt->size = newlen; - } -+ else if (ff_v4l2_context_q_count(capture) < 2) { -+ // Avoid running out of capture buffers -+ // In most cases the buffers will be returned quickly in which case -+ // we don't copy and can use the v4l2 buffers directly but sometimes -+ // ffmpeg seems to hold onto all of them for a long time (.mkv -+ // creation?) so avoid deadlock in those cases. 
-+ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); -+ if (buf == NULL) -+ goto fail_no_mem; -+ -+ memcpy(buf->data, avpkt->data, avpkt->size); -+ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer -+ -+ avpkt->buf = buf; -+ avpkt->data = buf->data; -+ } - - capture->first_buf = 0; - return 0; - -From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Oct 2022 11:00:16 +0000 -Subject: [PATCH 092/136] v4l2_m2m_dec: Fix pts_best_effort guessing for - initial pts - -(cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67) ---- - libavcodec/v4l2_m2m_dec.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index bec9b22fcf..47b2735f82 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -113,6 +113,8 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len) - - static int64_t pts_stats_guess(const pts_stats_t * const stats) - { -+ if (stats->last_count <= 1) -+ return stats->last_pts; - if (stats->last_pts == AV_NOPTS_VALUE || - stats->last_interval == 0 || - stats->last_count >= STATS_LAST_COUNT_MAX) - -From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Oct 2022 14:47:04 +0000 -Subject: [PATCH 093/136] v4l2_m2m_enc: Wait for frame or space in src Q in - rx_pkt - -If receive_packet we should ensure that there is space in the source Q -if we return EAGAIN so wait for either an output packet or space if -the source Q is currently full. 
- -(cherry picked from commit 82f0c55782a67a8cc665d937647706c2a75f5548) ---- - libavcodec/v4l2_m2m_enc.c | 22 +++++++++++++++++++--- - 1 file changed, 19 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index c23187e6e6..524e9424a5 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -415,13 +415,17 @@ static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * - return 1; - } - -+static inline int q_full(const V4L2Context *const output) -+{ -+ return ff_v4l2_context_q_count(output) == output->num_buffers; -+} - - static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const output = &s->output; - int rv; -- int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; -+ const int needs_slot = q_full(output); - - av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); - -@@ -549,8 +553,20 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - } - - dequeue: -- ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); -- ff_v4l2_dq_all(output, 0); -+ // Dequeue a frame -+ for (;;) { -+ int t = q_full(output) ? -1 : s->draining ? 300 : 0; -+ int rv2; -+ -+ // If output is full wait for either a packet or output to become not full -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t); -+ -+ // If output was full retry packet dequeue -+ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300; -+ rv2 = ff_v4l2_dq_all(output, t); -+ if (t == 0 || rv2 != 0) -+ break; -+ } - if (ret) - return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; - - -From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Oct 2022 14:54:29 +0000 -Subject: [PATCH 094/136] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS - in trace - -(cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a) ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index ce875c2c61..7c6751b69c 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -1668,8 +1668,8 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - V4L2Queue *output = &ctx->output; - int ret; - -- av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", -- __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); -+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n", -+ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); - av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, - avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); - - -From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Oct 2022 14:55:21 +0000 -Subject: [PATCH 095/136] vf_deinterlace_v4l2m2m: Ignore "wanted" when - processing input - -If we gate send a frame to the outlink on its frame_wanted flag then we -will sometimes stall as the flag may not get set by ffmpeg's filter -processing. So stuff the output whether or not it wants it which works -much better. 
- -(cherry picked from commit 808254cc04e5e6574cbab9af254b6c2f3d4142e3) ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 5 +---- - 1 file changed, 1 insertion(+), 4 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 7c6751b69c..a173a291f8 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -1812,10 +1812,7 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) - - ack_inlink(avctx, s, inlink); - -- if (!ff_outlink_frame_wanted(outlink)) { -- av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); -- } -- else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! -+ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! - { - AVFrame * frame = av_frame_alloc(); - int rv; - -From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Oct 2022 15:00:43 +0000 -Subject: [PATCH 096/136] conf_native: Add --enable-gpl - -(cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15) ---- - pi-util/conf_native.sh | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -index f22d531ca4..082d9b5832 100755 ---- a/pi-util/conf_native.sh -+++ b/pi-util/conf_native.sh -@@ -94,6 +94,7 @@ $FFSRC/configure \ - --enable-libdrm\ - --enable-vout-egl\ - --enable-vout-drm\ -+ --enable-gpl\ - $SHARED_LIBS\ - $RPIOPTS\ - --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ - -From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 15 Nov 2022 13:33:00 +0000 -Subject: [PATCH 097/136] egl_vout: Make formatting consistent - no code - changes - ---- - libavdevice/egl_vout.c | 741 ++++++++++++++++++++--------------------- - 1 file changed, 369 insertions(+), 372 deletions(-) - -diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c -index 7b9c610ace..a52cabb082 100644 ---- a/libavdevice/egl_vout.c -+++ 
b/libavdevice/egl_vout.c -@@ -48,20 +48,20 @@ - #define TRACE_ALL 0 - - struct egl_setup { -- int conId; -- -- Display *dpy; -- EGLDisplay egl_dpy; -- EGLContext ctx; -- EGLSurface surf; -- Window win; -- -- uint32_t crtcId; -- int crtcIdx; -- uint32_t planeId; -- struct { -- int x, y, width, height; -- } compose; -+ int conId; -+ -+ Display *dpy; -+ EGLDisplay egl_dpy; -+ EGLContext ctx; -+ EGLSurface surf; -+ Window win; -+ -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ struct { -+ int x, y, width, height; -+ } compose; - }; - - typedef struct egl_aux_s { -@@ -70,8 +70,7 @@ typedef struct egl_aux_s { - - } egl_aux_t; - --typedef struct egl_display_env_s --{ -+typedef struct egl_display_env_s { - AVClass *class; - - struct egl_setup setup; -@@ -89,8 +88,8 @@ typedef struct egl_display_env_s - sem_t display_start_sem; - sem_t q_sem; - int q_terminate; -- AVFrame * q_this; -- AVFrame * q_next; -+ AVFrame *q_this; -+ AVFrame *q_next; - - } egl_display_env_t; - -@@ -99,45 +98,44 @@ typedef struct egl_display_env_s - * Remove window border/decorations. - */ - static void --no_border( Display *dpy, Window w) -+no_border(Display *dpy, Window w) - { -- static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); -- static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; -- -- typedef struct -- { -- unsigned long flags; -- unsigned long functions; -- unsigned long decorations; -- long inputMode; -- unsigned long status; -- } PropMotifWmHints; -- -- PropMotifWmHints motif_hints; -- Atom prop, proptype; -- unsigned long flags = 0; -- -- /* setup the property */ -- motif_hints.flags = MWM_HINTS_DECORATIONS; -- motif_hints.decorations = flags; -- -- /* get the atom for the property */ -- prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); -- if (!prop) { -- /* something went wrong! 
*/ -- return; -- } -- -- /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ -- proptype = prop; -- -- XChangeProperty( dpy, w, /* display, window */ -+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); -+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; -+ -+ typedef struct { -+ unsigned long flags; -+ unsigned long functions; -+ unsigned long decorations; -+ long inputMode; -+ unsigned long status; -+ } PropMotifWmHints; -+ -+ PropMotifWmHints motif_hints; -+ Atom prop, proptype; -+ unsigned long flags = 0; -+ -+ /* setup the property */ -+ motif_hints.flags = MWM_HINTS_DECORATIONS; -+ motif_hints.decorations = flags; -+ -+ /* get the atom for the property */ -+ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True); -+ if (!prop) { -+ /* something went wrong! */ -+ return; -+ } -+ -+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ -+ proptype = prop; -+ -+ XChangeProperty(dpy, w, /* display, window */ - prop, proptype, /* property, type */ - 32, /* format: 32-bit datums */ - PropModeReplace, /* mode */ -- (unsigned char *) &motif_hints, /* data */ -+ (unsigned char *)&motif_hints, /* data */ - PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ -- ); -+ ); - } - - -@@ -146,247 +144,247 @@ no_border( Display *dpy, Window w) - * Return the window and context handles. - */ - static int --make_window(struct AVFormatContext * const s, -- egl_display_env_t * const de, -+make_window(struct AVFormatContext *const s, -+ egl_display_env_t *const de, - Display *dpy, EGLDisplay egl_dpy, const char *name, - Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) - { -- int scrnum = DefaultScreen( dpy ); -- XSetWindowAttributes attr; -- unsigned long mask; -- Window root = RootWindow( dpy, scrnum ); -- Window win; -- EGLContext ctx; -- const int fullscreen = de->fullscreen; -- EGLConfig config; -- int x = de->window_x; -- int y = de->window_y; -- int width = de->window_width ? de->window_width : 1280; -- int height = de->window_height ? 
de->window_height : 720; -- -- -- if (fullscreen) { -- int scrnum = DefaultScreen(dpy); -- -- x = 0; y = 0; -- width = DisplayWidth(dpy, scrnum); -- height = DisplayHeight(dpy, scrnum); -- } -- -- { -- EGLint num_configs; -- static const EGLint attribs[] = { -- EGL_RED_SIZE, 1, -- EGL_GREEN_SIZE, 1, -- EGL_BLUE_SIZE, 1, -- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, -- EGL_NONE -- }; -- -- if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { -- av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); -- return -1; -- } -- } -- -- { -- EGLint vid; -- if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { -- av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); -- return -1; -- } -- -- { -- XVisualInfo visTemplate = { -- .visualid = vid, -- }; -- int num_visuals; -- XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, -- &visTemplate, &num_visuals); -- -- /* window attributes */ -- attr.background_pixel = 0; -- attr.border_pixel = 0; -- attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); -- attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; -- /* XXX this is a bad way to get a borderless window! 
*/ -- mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; -- -- win = XCreateWindow( dpy, root, x, y, width, height, -- 0, visinfo->depth, InputOutput, -- visinfo->visual, mask, &attr ); -- XFree(visinfo); -- } -- } -- -- if (fullscreen) -- no_border(dpy, win); -- -- /* set hints and properties */ -- { -- XSizeHints sizehints; -- sizehints.x = x; -- sizehints.y = y; -- sizehints.width = width; -- sizehints.height = height; -- sizehints.flags = USSize | USPosition; -- XSetNormalHints(dpy, win, &sizehints); -- XSetStandardProperties(dpy, win, name, name, -- None, (char **)NULL, 0, &sizehints); -- } -- -- eglBindAPI(EGL_OPENGL_ES_API); -- -- { -- static const EGLint ctx_attribs[] = { -- EGL_CONTEXT_CLIENT_VERSION, 2, -- EGL_NONE -- }; -- ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); -- if (!ctx) { -- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -- return -1; -- } -- } -- -- -- XMapWindow(dpy, win); -- -- { -- EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); -- if (!surf) { -- av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); -- return -1; -- } -- -- if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { -- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -- return -1; -- } -- -- *winRet = win; -- *ctxRet = ctx; -- *surfRet = surf; -- } -- -- return 0; -+ int scrnum = DefaultScreen(dpy); -+ XSetWindowAttributes attr; -+ unsigned long mask; -+ Window root = RootWindow(dpy, scrnum); -+ Window win; -+ EGLContext ctx; -+ const int fullscreen = de->fullscreen; -+ EGLConfig config; -+ int x = de->window_x; -+ int y = de->window_y; -+ int width = de->window_width ? de->window_width : 1280; -+ int height = de->window_height ? 
de->window_height : 720; -+ -+ -+ if (fullscreen) { -+ int scrnum = DefaultScreen(dpy); -+ -+ x = 0; y = 0; -+ width = DisplayWidth(dpy, scrnum); -+ height = DisplayHeight(dpy, scrnum); -+ } -+ -+ { -+ EGLint num_configs; -+ static const EGLint attribs[] = { -+ EGL_RED_SIZE, 1, -+ EGL_GREEN_SIZE, 1, -+ EGL_BLUE_SIZE, 1, -+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, -+ EGL_NONE -+ }; -+ -+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { -+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); -+ return -1; -+ } -+ } -+ -+ { -+ EGLint vid; -+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); -+ return -1; -+ } -+ -+ { -+ XVisualInfo visTemplate = { -+ .visualid = vid, -+ }; -+ int num_visuals; -+ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, -+ &visTemplate, &num_visuals); -+ -+ /* window attributes */ -+ attr.background_pixel = 0; -+ attr.border_pixel = 0; -+ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone); -+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; -+ /* XXX this is a bad way to get a borderless window! 
*/ -+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; -+ -+ win = XCreateWindow(dpy, root, x, y, width, height, -+ 0, visinfo->depth, InputOutput, -+ visinfo->visual, mask, &attr); -+ XFree(visinfo); -+ } -+ } -+ -+ if (fullscreen) -+ no_border(dpy, win); -+ -+ /* set hints and properties */ -+ { -+ XSizeHints sizehints; -+ sizehints.x = x; -+ sizehints.y = y; -+ sizehints.width = width; -+ sizehints.height = height; -+ sizehints.flags = USSize | USPosition; -+ XSetNormalHints(dpy, win, &sizehints); -+ XSetStandardProperties(dpy, win, name, name, -+ None, (char **)NULL, 0, &sizehints); -+ } -+ -+ eglBindAPI(EGL_OPENGL_ES_API); -+ -+ { -+ static const EGLint ctx_attribs[] = { -+ EGL_CONTEXT_CLIENT_VERSION, 2, -+ EGL_NONE -+ }; -+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs); -+ if (!ctx) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ } -+ -+ -+ XMapWindow(dpy, win); -+ -+ { -+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); -+ if (!surf) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); -+ return -1; -+ } -+ -+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ -+ *winRet = win; -+ *ctxRet = ctx; -+ *surfRet = surf; -+ } -+ -+ return 0; - } - - static GLint --compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) -+compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source) - { -- GLuint s = glCreateShader(target); -+ GLuint s = glCreateShader(target); - -- if (s == 0) { -- av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); -- return 0; -- } -+ if (s == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); -+ return 0; -+ } - -- glShaderSource(s, 1, (const GLchar **) &source, NULL); -- glCompileShader(s); -+ glShaderSource(s, 1, (const GLchar **)&source, NULL); -+ 
glCompileShader(s); - -- { -- GLint ok; -- glGetShaderiv(s, GL_COMPILE_STATUS, &ok); -+ { -+ GLint ok; -+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); - -- if (!ok) { -- GLchar *info; -- GLint size; -+ if (!ok) { -+ GLchar *info; -+ GLint size; - -- glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); -- info = malloc(size); -+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); -+ info = malloc(size); - -- glGetShaderInfoLog(s, size, NULL, info); -- av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); -+ glGetShaderInfoLog(s, size, NULL, info); -+ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); - -- return 0; -- } -- } -+ return 0; -+ } -+ } - -- return s; -+ return s; - } - --static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) -+static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs) - { -- GLuint prog = glCreateProgram(); -- -- if (prog == 0) { -- av_log(s, AV_LOG_ERROR, "Failed to create program\n"); -- return 0; -- } -- -- glAttachShader(prog, vs); -- glAttachShader(prog, fs); -- glLinkProgram(prog); -- -- { -- GLint ok; -- glGetProgramiv(prog, GL_LINK_STATUS, &ok); -- if (!ok) { -- /* Some drivers return a size of 1 for an empty log. This is the size -- * of a log that contains only a terminating NUL character. -- */ -- GLint size; -- GLchar *info = NULL; -- glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); -- if (size > 1) { -- info = malloc(size); -- glGetProgramInfoLog(prog, size, NULL, info); -- } -+ GLuint prog = glCreateProgram(); - -- av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", -- (info != NULL) ? 
info : ""); -- return 0; -- } -- } -+ if (prog == 0) { -+ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); -+ return 0; -+ } -+ -+ glAttachShader(prog, vs); -+ glAttachShader(prog, fs); -+ glLinkProgram(prog); -+ -+ { -+ GLint ok; -+ glGetProgramiv(prog, GL_LINK_STATUS, &ok); -+ if (!ok) { -+ /* Some drivers return a size of 1 for an empty log. This is the size -+ * of a log that contains only a terminating NUL character. -+ */ -+ GLint size; -+ GLchar *info = NULL; -+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); -+ if (size > 1) { -+ info = malloc(size); -+ glGetProgramInfoLog(prog, size, NULL, info); -+ } - -- return prog; -+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", -+ (info != NULL) ? info : ""); -+ return 0; -+ } -+ } -+ -+ return prog; - } - - static int --gl_setup(struct AVFormatContext * const s) -+gl_setup(struct AVFormatContext *const s) - { -- const char *vs = -- "attribute vec4 pos;\n" -- "varying vec2 texcoord;\n" -- "\n" -- "void main() {\n" -- " gl_Position = pos;\n" -- " texcoord.x = (pos.x + 1.0) / 2.0;\n" -- " texcoord.y = (-pos.y + 1.0) / 2.0;\n" -- "}\n"; -- const char *fs = -- "#extension GL_OES_EGL_image_external : enable\n" -- "precision mediump float;\n" -- "uniform samplerExternalOES s;\n" -- "varying vec2 texcoord;\n" -- "void main() {\n" -- " gl_FragColor = texture2D(s, texcoord);\n" -- "}\n"; -- -- GLuint vs_s; -- GLuint fs_s; -- GLuint prog; -- -- if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || -- !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || -- !(prog = link_program(s, vs_s, fs_s))) -- return -1; -- -- glUseProgram(prog); -- -- { -- static const float verts[] = { -- -1, -1, -- 1, -1, -- 1, 1, -- -1, 1, -- }; -- glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); -- } -- -- glEnableVertexAttribArray(0); -- return 0; -+ const char *vs = -+ "attribute vec4 pos;\n" -+ "varying vec2 texcoord;\n" -+ "\n" -+ "void main() {\n" -+ " gl_Position = pos;\n" -+ " texcoord.x = (pos.x + 1.0) / 2.0;\n" -+ " 
texcoord.y = (-pos.y + 1.0) / 2.0;\n" -+ "}\n"; -+ const char *fs = -+ "#extension GL_OES_EGL_image_external : enable\n" -+ "precision mediump float;\n" -+ "uniform samplerExternalOES s;\n" -+ "varying vec2 texcoord;\n" -+ "void main() {\n" -+ " gl_FragColor = texture2D(s, texcoord);\n" -+ "}\n"; -+ -+ GLuint vs_s; -+ GLuint fs_s; -+ GLuint prog; -+ -+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || -+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || -+ !(prog = link_program(s, vs_s, fs_s))) -+ return -1; -+ -+ glUseProgram(prog); -+ -+ { -+ static const float verts[] = { -+ -1, -1, -+ 1, -1, -+ 1, 1, -+ -1, 1, -+ }; -+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); -+ } -+ -+ glEnableVertexAttribArray(0); -+ return 0; - } - - static int egl_vout_write_trailer(AVFormatContext *s) -@@ -400,12 +398,12 @@ static int egl_vout_write_trailer(AVFormatContext *s) - - static int egl_vout_write_header(AVFormatContext *s) - { -- const AVCodecParameters * const par = s->streams[0]->codecpar; -+ const AVCodecParameters *const par = s->streams[0]->codecpar; - - #if TRACE_ALL - av_log(s, AV_LOG_INFO, "%s\n", __func__); - #endif -- if ( s->nb_streams > 1 -+ if (s->nb_streams > 1 - || par->codec_type != AVMEDIA_TYPE_VIDEO - || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { - av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -@@ -416,10 +414,10 @@ static int egl_vout_write_header(AVFormatContext *s) - } - - --static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) -+static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame) - { -- const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; -- egl_aux_t * da = NULL; -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ egl_aux_t *da = NULL; - unsigned int i; - - #if TRACE_ALL -@@ -440,26 +438,26 @@ static int do_display(AVFormatContext * const s, 
egl_display_env_t * const de, A - - if (da->texture == 0) { - EGLint attribs[50]; -- EGLint * a = attribs; -+ EGLint *a = attribs; - int i, j; - static const EGLint anames[] = { -- EGL_DMA_BUF_PLANE0_FD_EXT, -- EGL_DMA_BUF_PLANE0_OFFSET_EXT, -- EGL_DMA_BUF_PLANE0_PITCH_EXT, -- EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, -- EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, -- EGL_DMA_BUF_PLANE1_FD_EXT, -- EGL_DMA_BUF_PLANE1_OFFSET_EXT, -- EGL_DMA_BUF_PLANE1_PITCH_EXT, -- EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, -- EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, -- EGL_DMA_BUF_PLANE2_FD_EXT, -- EGL_DMA_BUF_PLANE2_OFFSET_EXT, -- EGL_DMA_BUF_PLANE2_PITCH_EXT, -- EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, -- EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE0_FD_EXT, -+ EGL_DMA_BUF_PLANE0_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE0_PITCH_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE1_FD_EXT, -+ EGL_DMA_BUF_PLANE1_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE1_PITCH_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE2_FD_EXT, -+ EGL_DMA_BUF_PLANE2_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE2_PITCH_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, - }; -- const EGLint * b = anames; -+ const EGLint *b = anames; - - *a++ = EGL_WIDTH; - *a++ = av_frame_cropped_width(frame); -@@ -470,8 +468,8 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A - - for (i = 0; i < desc->nb_layers; ++i) { - for (j = 0; j < desc->layers[i].nb_planes; ++j) { -- const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; -- const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; -+ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index; - *a++ = *b++; - *a++ = obj->fd; - *a++ = *b++; -@@ -479,13 +477,13 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A - 
*a++ = *b++; - *a++ = p->pitch; - if (obj->format_modifier == 0) { -- b += 2; -+ b += 2; - } - else { -- *a++ = *b++; -- *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); -- *a++ = *b++; -- *a++ = (EGLint)(obj->format_modifier >> 32); -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier >> 32); - } - } - } -@@ -494,26 +492,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A - - #if TRACE_ALL - for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { -- av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); -+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); - } - #endif - { -- const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, -- EGL_NO_CONTEXT, -- EGL_LINUX_DMA_BUF_EXT, -- NULL, attribs); -- if (!image) { -- av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); -- return -1; -- } -- -- glGenTextures(1, &da->texture); -- glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); -- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); -- glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); -- -- eglDestroyImageKHR(de->setup.egl_dpy, image); -+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, -+ EGL_NO_CONTEXT, -+ EGL_LINUX_DMA_BUF_EXT, -+ NULL, attribs); -+ if (!image) { -+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); -+ return -1; -+ } -+ -+ glGenTextures(1, &da->texture); -+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); -+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); -+ -+ eglDestroyImageKHR(de->setup.egl_dpy, image); - } - - da->fd = desc->objects[0].fd; -@@ -540,7 +538,7 @@ static int 
do_display(AVFormatContext * const s, egl_display_env_t * const de, A - (long long)modifiers[1], - (long long)modifiers[2], - (long long)modifiers[3] -- ); -+ ); - #endif - } - -@@ -558,55 +556,55 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A - return 0; - } - --static void * display_thread(void * v) -+static void* display_thread(void *v) - { -- AVFormatContext * const s = v; -- egl_display_env_t * const de = s->priv_data; -+ AVFormatContext *const s = v; -+ egl_display_env_t *const de = s->priv_data; - - #if TRACE_ALL - av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); - #endif - { -- EGLint egl_major, egl_minor; -- -- de->setup.dpy = XOpenDisplay(NULL); -- if (!de->setup.dpy) { -- av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); -- goto fail; -- } -- -- de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); -- if (!de->setup.egl_dpy) { -- av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); -- goto fail; -- } -- -- if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { -- av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); -- goto fail; -- } -- -- av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); -- -- if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { -- av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); -- goto fail; -- } -+ EGLint egl_major, egl_minor; -+ -+ de->setup.dpy = XOpenDisplay(NULL); -+ if (!de->setup.dpy) { -+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); -+ goto fail; -+ } -+ -+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); -+ if (!de->setup.egl_dpy) { -+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); -+ goto fail; -+ } -+ -+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); -+ goto fail; -+ } -+ -+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); -+ -+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { -+ 
av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); -+ goto fail; -+ } - } - - if (!de->window_width || !de->window_height) { -- de->window_width = 1280; -- de->window_height = 720; -+ de->window_width = 1280; -+ de->window_height = 720; - } - if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", - &de->setup.win, &de->setup.ctx, &de->setup.surf)) { -- av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); -- goto fail; -+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); -+ goto fail; - } - - if (gl_setup(s)) { -- av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); -- goto fail; -+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); -+ goto fail; - } - - #if TRACE_ALL -@@ -615,7 +613,7 @@ static void * display_thread(void * v) - sem_post(&de->display_start_sem); - - for (;;) { -- AVFrame * frame; -+ AVFrame *frame; - - while (sem_wait(&de->q_sem) != 0) { - av_assert0(errno == EINTR); -@@ -653,9 +651,9 @@ fail: - - static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) - { -- const AVFrame * const src_frame = (AVFrame *)pkt->data; -- AVFrame * frame; -- egl_display_env_t * const de = s->priv_data; -+ const AVFrame *const src_frame = (AVFrame *)pkt->data; -+ AVFrame *frame; -+ egl_display_env_t *const de = s->priv_data; - - #if TRACE_ALL - av_log(s, AV_LOG_INFO, "%s\n", __func__); -@@ -668,8 +666,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) - else if (src_frame->format == AV_PIX_FMT_VAAPI) { - frame = av_frame_alloc(); - frame->format = AV_PIX_FMT_DRM_PRIME; -- if (av_hwframe_map(frame, src_frame, 0) != 0) -- { -+ if (av_hwframe_map(frame, src_frame, 0) != 0) { - av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); - av_frame_free(&frame); - return AVERROR(EINVAL); -@@ -682,12 +679,12 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) - - // Really hacky sync - while (de->show_all && de->q_next) { -- 
usleep(3000); -+ usleep(3000); - } - - pthread_mutex_lock(&de->q_lock); - { -- AVFrame * const t = de->q_next; -+ AVFrame *const t = de->q_next; - de->q_next = frame; - frame = t; - } -@@ -702,7 +699,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) - } - - static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -- unsigned flags) -+ unsigned flags) - { - av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); - return AVERROR_PATCHWELCOME; -@@ -713,7 +710,7 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si - #if TRACE_ALL - av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); - #endif -- switch(type) { -+ switch (type) { - case AV_APP_TO_DEV_WINDOW_REPAINT: - return 0; - default: -@@ -723,14 +720,14 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si - } - - // deinit is called if init fails so no need to clean up explicity here --static int egl_vout_init(struct AVFormatContext * s) -+static int egl_vout_init(struct AVFormatContext *s) - { -- egl_display_env_t * const de = s->priv_data; -+ egl_display_env_t *const de = s->priv_data; - unsigned int i; - - av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); - -- de->setup = (struct egl_setup){0}; -+ de->setup = (struct egl_setup) { 0 }; - - for (i = 0; i != 32; ++i) { - de->aux[i].fd = -1; -@@ -744,8 +741,8 @@ static int egl_vout_init(struct AVFormatContext * s) - - sem_wait(&de->display_start_sem); - if (de->q_terminate) { -- av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); -- return -1; -+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); -+ return -1; - } - - av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -@@ -753,9 +750,9 @@ static int egl_vout_init(struct AVFormatContext * s) - return 0; - } - --static void egl_vout_deinit(struct AVFormatContext * s) -+static void egl_vout_deinit(struct AVFormatContext *s) - { -- 
egl_display_env_t * const de = s->priv_data; -+ egl_display_env_t *const de = s->priv_data; - - av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); - -@@ -773,11 +770,11 @@ static void egl_vout_deinit(struct AVFormatContext * s) - - #define OFFSET(x) offsetof(egl_display_env_t, x) - static const AVOption options[] = { -- { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -- { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -- { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -- { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -- { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, - { NULL } - - }; - -From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Dec 2022 16:49:43 +0000 -Subject: [PATCH 098/136] v4l2m2m: reporganise get_raw_format for loop logic - ---- - libavcodec/v4l2_context.c | 16 +++++----------- - 1 file changed, 5 insertions(+), 11 deletions(-) - 
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 7031f3d340..79a31cf930 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -828,28 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) - return 0; - } - -- for (;;) { -+ for (;; ++fdesc.index) { - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc); - if (ret) - return AVERROR(EINVAL); - - if (priv->pix_fmt != AV_PIX_FMT_NONE) { -- if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { -- fdesc.index++; -+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) - continue; -- } - } - - pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); - ret = v4l2_try_raw_format(ctx, pixfmt); -- if (ret){ -- fdesc.index++; -- continue; -+ if (ret == 0) { -+ *p = pixfmt; -+ return 0; - } -- -- *p = pixfmt; -- -- return 0; - } - - return AVERROR(EINVAL); - -From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Dec 2022 17:49:12 +0000 -Subject: [PATCH 099/136] drm_vout: Set zpos on the plane we pick to ensure it - is at the front - ---- - libavdevice/drm_vout.c | 38 +++++++++++++++++++++++++++++++++----- - 1 file changed, 33 insertions(+), 5 deletions(-) - -diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -index cfb33ce7c3..9bd9e04421 100644 ---- a/libavdevice/drm_vout.c -+++ b/libavdevice/drm_vout.c -@@ -115,9 +115,11 @@ static int find_plane(struct AVFormatContext * const avctx, - { - drmModePlaneResPtr planes; - drmModePlanePtr plane; -+ drmModeObjectPropertiesPtr props = NULL; -+ drmModePropertyPtr prop = NULL; - unsigned int i; - unsigned int j; -- int ret = 0; -+ int ret = -1; - - planes = drmModeGetPlaneResources(drmfd); - if (!planes) -@@ -154,11 +156,37 @@ static int find_plane(struct AVFormatContext * const avctx, - break; - } - -- if (i == planes->count_planes) -- ret = -1; -+ if (i == planes->count_planes) { -+ ret = 
-1; -+ goto fail; -+ } - -- drmModeFreePlaneResources(planes); -- return ret; -+ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE); -+ if (!props) -+ goto fail; -+ for (i = 0; i != props->count_props; ++i) { -+ if (prop) -+ drmModeFreeProperty(prop); -+ prop = drmModeGetProperty(drmfd, props->props[i]); -+ if (!prop) -+ goto fail; -+ if (strcmp("zpos", prop->name) == 0) { -+ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0) -+ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]); -+ else -+ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n"); -+ break; -+ } -+ } -+ -+ ret = 0; -+fail: -+ if (props) -+ drmModeFreeObjectProperties(props); -+ if (prop) -+ drmModeFreeProperty(prop); -+ drmModeFreePlaneResources(planes); -+ return ret; - } - - static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) - -From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Dec 2022 17:51:46 +0000 -Subject: [PATCH 100/136] drm_vout: Only set modifier flag and pass modifiers - if there are some - ---- - libavdevice/drm_vout.c | 17 ++++++++++++----- - 1 file changed, 12 insertions(+), 5 deletions(-) - -diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -index 9bd9e04421..a56adea866 100644 ---- a/libavdevice/drm_vout.c -+++ b/libavdevice/drm_vout.c -@@ -34,6 +34,7 @@ - - #include - #include -+#include - - #define TRACE_ALL 0 - -@@ -249,6 +250,7 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A - uint32_t offsets[4] = {0}; - uint64_t modifiers[4] = {0}; - uint32_t bo_handles[4] = {0}; -+ int has_mods = 0; - int i, j, n; - - da->frame = frame; -@@ -258,6 +260,9 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A - av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); - return -1; - } -+ if 
(desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR && -+ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID) -+ has_mods = 1; - } - - n = 0; -@@ -299,11 +304,13 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A - #endif - - if (drmModeAddFB2WithModifiers(de->drm_fd, -- av_frame_cropped_width(frame), -- av_frame_cropped_height(frame), -- desc->layers[0].format, bo_handles, -- pitches, offsets, modifiers, -- &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, bo_handles, -+ pitches, offsets, -+ has_mods ? modifiers : NULL, -+ &da->fb_handle, -+ has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) { - av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); - return -1; - } - -From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Dec 2022 17:52:58 +0000 -Subject: [PATCH 101/136] drm_vout: Fix typo in error message - ---- - libavdevice/drm_vout.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -index a56adea866..351abf1d60 100644 ---- a/libavdevice/drm_vout.c -+++ b/libavdevice/drm_vout.c -@@ -596,7 +596,7 @@ static int drm_vout_init(struct AVFormatContext * s) - sem_init(&de->q_sem_out, 0, 0); - if (pthread_create(&de->q_thread, NULL, display_thread, s)) { - rv = AVERROR(errno); -- av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); -+ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv)); - goto fail_close; - } - - -From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 12 Dec 2022 18:00:41 +0000 -Subject: [PATCH 102/136] drm_vout: Add option to name the drm_module to use - ---- - libavdevice/drm_vout.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git 
a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -index 351abf1d60..491e1dc608 100644 ---- a/libavdevice/drm_vout.c -+++ b/libavdevice/drm_vout.c -@@ -70,7 +70,9 @@ typedef struct drm_display_env_s - uint32_t con_id; - struct drm_setup setup; - enum AVPixelFormat avfmt; -+ - int show_all; -+ const char * drm_module; - - unsigned int ano; - drm_aux_t aux[AUX_SIZE]; -@@ -569,7 +571,6 @@ static int drm_vout_init(struct AVFormatContext * s) - { - drm_display_env_t * const de = s->priv_data; - int rv; -- const char * drm_module = DRM_MODULE; - - av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); - -@@ -578,10 +579,10 @@ static int drm_vout_init(struct AVFormatContext * s) - de->setup = (struct drm_setup){0}; - de->q_terminate = 0; - -- if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) -+ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) - { - rv = AVERROR(errno); -- av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); -+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv)); - return rv; - } - -@@ -641,6 +642,7 @@ static void drm_vout_deinit(struct AVFormatContext * s) - #define OFFSET(x) offsetof(drm_display_env_t, x) - static const AVOption options[] = { - { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, - { NULL } - }; - - -From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 13:01:00 +0000 -Subject: [PATCH 103/136] dmabufs: Rework to allow for non-CMA backends - ---- - libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++---------- - 1 file changed, 116 insertions(+), 45 deletions(-) - -diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c -index c4bbed18c6..1c3a5e861f 100644 ---- 
a/libavcodec/v4l2_req_dmabufs.c -+++ b/libavcodec/v4l2_req_dmabufs.c -@@ -1,3 +1,4 @@ -+#include - #include - #include - #include -@@ -19,9 +20,21 @@ - - #define TRACE_ALLOC 0 - -+struct dmabufs_ctl; -+struct dmabuf_h; -+ -+struct dmabuf_fns { -+ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size); -+ void (*buf_free)(struct dmabuf_h * dh); -+ int (*ctl_new)(struct dmabufs_ctl * dbsc); -+ void (*ctl_free)(struct dmabufs_ctl * dbsc); -+}; -+ - struct dmabufs_ctl { - int fd; - size_t page_size; -+ void * v; -+ const struct dmabuf_fns * fns; - }; - - struct dmabuf_h { -@@ -29,6 +42,8 @@ struct dmabuf_h { - size_t size; - size_t len; - void * mapptr; -+ void * v; -+ const struct dmabuf_fns * fns; - }; - - #if TRACE_ALLOC -@@ -88,15 +103,8 @@ struct dmabuf_h * dmabuf_import(int fd, size_t size) - struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) - { - struct dmabuf_h * dh; -- struct dma_heap_allocation_data data = { -- .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), -- .fd = 0, -- .fd_flags = O_RDWR, -- .heap_flags = 0 -- }; -- - if (old != NULL) { -- if (old->size == data.len) { -+ if (old->size >= size) { - return old; - } - dmabuf_free(old); -@@ -106,24 +114,16 @@ struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * ol - (dh = malloc(sizeof(*dh))) == NULL) - return NULL; - -- while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { -- int err = errno; -- request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", -- (uint64_t)data.len, -- dbsc->fd, -- err, -- strerror(err)); -- if (err == EINTR) -- continue; -- goto fail; -- } -- - *dh = (struct dmabuf_h){ -- .fd = data.fd, -- .size = (size_t)data.len, -- .mapptr = MAP_FAILED -+ .fd = -1, -+ .mapptr = MAP_FAILED, -+ .fns = dbsc->fns - }; - -+ if (dh->fns->buf_alloc(dbsc, dh, size) != 0) -+ goto fail; -+ -+ - #if TRACE_ALLOC - ++total_bufs; - total_size += dh->size; -@@ -220,8 +220,6 @@ void 
dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) - dh->len = len; - } - -- -- - void dmabuf_free(struct dmabuf_h * dh) - { - if (!dh) -@@ -233,20 +231,63 @@ void dmabuf_free(struct dmabuf_h * dh) - request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); - #endif - -- if (dh->mapptr != MAP_FAILED) -+ dh->fns->buf_free(dh); -+ -+ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL) - munmap(dh->mapptr, dh->size); -- while (close(dh->fd) == -1 && errno == EINTR) -- /* loop */; -+ if (dh->fd != -1) -+ while (close(dh->fd) == -1 && errno == EINTR) -+ /* loop */; - free(dh); - } - --struct dmabufs_ctl * dmabufs_ctl_new(void) -+static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns) - { -- struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); -+ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc)); - - if (!dbsc) - return NULL; - -+ dbsc->fd = -1; -+ dbsc->fns = fns; -+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); -+ -+ if (fns->ctl_new(dbsc) != 0) -+ goto fail; -+ -+ return dbsc; -+ -+fail: -+ free(dbsc); -+ return NULL; -+} -+ -+static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) -+{ -+ request_debug(NULL, "Free dmabuf ctl\n"); -+ -+ dbsc->fns->ctl_free(dbsc); -+ -+ free(dbsc); -+} -+ -+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) -+{ -+ struct dmabufs_ctl * const dbsc = *pDbsc; -+ -+ if (!dbsc) -+ return; -+ *pDbsc = NULL; -+ -+ dmabufs_ctl_free(dbsc); -+} -+ -+//----------------------------------------------------------------------------- -+// -+// Alloc dmabuf via CMA -+ -+static int ctl_cma_new(struct dmabufs_ctl * dbsc) -+{ - while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && - errno == EINTR) - /* Loop */; -@@ -258,31 +299,61 @@ struct dmabufs_ctl * dmabufs_ctl_new(void) - if (dbsc->fd == -1) { - request_log("Unable to open either %s or %s\n", - DMABUF_NAME1, DMABUF_NAME2); -- goto fail; -+ return -1; - } - } -+ return 0; -+} - -- dbsc->page_size = 
(size_t)sysconf(_SC_PAGE_SIZE); -- -- return dbsc; -+static void ctl_cma_free(struct dmabufs_ctl * dbsc) -+{ -+ if (dbsc->fd != -1) -+ while (close(dbsc->fd) == -1 && errno == EINTR) -+ /* loop */; - --fail: -- free(dbsc); -- return NULL; - } - --void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) -+static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) - { -- struct dmabufs_ctl * const dbsc = *pDbsc; -+ struct dma_heap_allocation_data data = { -+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), -+ .fd = 0, -+ .fd_flags = O_RDWR, -+ .heap_flags = 0 ++ /* Source buffers are only as big as needed, since any over-read won't affect results */ ++ LOCAL_ALIGNED_16(int16_t, src0, [64]); ++ LOCAL_ALIGNED_16(int16_t, src1, [64]); ++ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */ ++ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); ++ ++ AVCodecContext avctx = { 0 }; ++ IDCTDSPContext h; ++ ++ const test tests[] = { ++ IDCTDSP_TEST(add_pixels_clamped) ++ IDCTDSP_TEST(put_pixels_clamped) ++ IDCTDSP_TEST(put_signed_pixels_clamped) + }; - -- if (!dbsc) -- return; -- *pDbsc = NULL; -+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { -+ int err = errno; -+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", -+ (uint64_t)data.len, -+ dbsc->fd, -+ err, -+ strerror(err)); -+ if (err == EINTR) -+ continue; -+ return -err; -+ } - -- while (close(dbsc->fd) == -1 && errno == EINTR) -- /* loop */; -+ dh->fd = data.fd; -+ dh->size = (size_t)data.len; -+ return 0; -+} - -- free(dbsc); -+static void buf_cma_free(struct dmabuf_h * dh) -+{ -+ // Nothing needed - } - -+static const struct dmabuf_fns dmabuf_cma_fns = { -+ .buf_alloc = buf_cma_alloc, -+ .buf_free = buf_cma_free, -+ .ctl_new = ctl_cma_new, -+ .ctl_free = ctl_cma_free, -+}; + -+struct dmabufs_ctl * dmabufs_ctl_new(void) -+{ -+ request_debug(NULL, 
"Dmabufs using CMA\n");; -+ return dmabufs_ctl_new2(&dmabuf_cma_fns); -+} - - -From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 13:07:58 +0000 -Subject: [PATCH 104/136] dmabufs: Use unref rather than deleet on cmabufs_ctl - ---- - libavcodec/v4l2_req_dmabufs.c | 12 +++++++++++- - libavcodec/v4l2_req_dmabufs.h | 3 ++- - libavcodec/v4l2_request_hevc.c | 4 ++-- - 3 files changed, 15 insertions(+), 4 deletions(-) - -diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c -index 1c3a5e861f..acc0366e76 100644 ---- a/libavcodec/v4l2_req_dmabufs.c -+++ b/libavcodec/v4l2_req_dmabufs.c -@@ -31,6 +31,7 @@ struct dmabuf_fns { - }; - - struct dmabufs_ctl { -+ atomic_int ref_count; - int fd; - size_t page_size; - void * v; -@@ -271,7 +272,7 @@ static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) - free(dbsc); - } - --void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) -+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc) - { - struct dmabufs_ctl * const dbsc = *pDbsc; - -@@ -279,9 +280,18 @@ void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) - return; - *pDbsc = NULL; - -+ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0) -+ return; ++ ff_idctdsp_init(&h, &avctx); + - dmabufs_ctl_free(dbsc); - } - -+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) -+{ -+ atomic_fetch_add(&dbsc->ref_count, 1); -+ return dbsc; -+} -+ - //----------------------------------------------------------------------------- - // - // Alloc dmabuf via CMA -diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h -index c1d3d8c8d7..381ba2708d 100644 ---- a/libavcodec/v4l2_req_dmabufs.h -+++ b/libavcodec/v4l2_req_dmabufs.h -@@ -7,7 +7,8 @@ struct dmabufs_ctl; - struct dmabuf_h; - - struct dmabufs_ctl * dmabufs_ctl_new(void); --void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); -+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc); -+struct 
dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc); - - // Need not preserve old contents - // On NULL return old buffer is freed -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 767ecb036a..db7ed13b6d 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -105,7 +105,7 @@ static int v4l2_request_hevc_uninit(AVCodecContext *avctx) - mediabufs_ctl_unref(&ctx->mbufs); - media_pool_delete(&ctx->mpool); - pollqueue_unref(&ctx->pq); -- dmabufs_ctl_delete(&ctx->dbufs); -+ dmabufs_ctl_unref(&ctx->dbufs); - devscan_delete(&ctx->devscan); - - decode_q_uninit(&ctx->decode_q); -@@ -324,7 +324,7 @@ fail3: - fail2: - pollqueue_unref(&ctx->pq); - fail1: -- dmabufs_ctl_delete(&ctx->dbufs); -+ dmabufs_ctl_unref(&ctx->dbufs); - fail0: - devscan_delete(&ctx->devscan); - return ret; - -From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 14:21:40 +0000 -Subject: [PATCH 105/136] egl_vout: Remove redundant & completely broken debug - ---- - libavdevice/egl_vout.c | 25 ------------------------- - 1 file changed, 25 deletions(-) - -diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c -index a52cabb082..afc7afd13e 100644 ---- a/libavdevice/egl_vout.c -+++ b/libavdevice/egl_vout.c -@@ -515,31 +515,6 @@ static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVF - } - - da->fd = desc->objects[0].fd; -- --#if 0 -- av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," -- " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", -- av_frame_cropped_width(frame), -- av_frame_cropped_height(frame), -- desc->layers[0].format, -- bo_plane_handles[0], -- bo_plane_handles[1], -- bo_plane_handles[2], -- bo_plane_handles[3], -- pitches[0], -- pitches[1], -- pitches[2], -- pitches[3], -- offsets[0], -- offsets[1], -- offsets[2], -- offsets[3], -- (long long)modifiers[0], -- (long long)modifiers[1], -- (long 
long)modifiers[2], -- (long long)modifiers[3] -- ); --#endif - } - - glClearColor(0.5, 0.5, 0.5, 0.5); - -From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 16:12:12 +0000 -Subject: [PATCH 106/136] v4l2m2m: Use offset from querybuf rather than always - 0 - ---- - libavcodec/v4l2_buffers.c | 4 +++- - libavcodec/v4l2_buffers.h | 3 ++- - 2 files changed, 5 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 9ef2f40e39..5ca58ea593 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -379,7 +379,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) - - for (int i = 0; i < avbuf->num_planes; i++) { - layer->planes[i].object_index = i; -- layer->planes[i].offset = 0; -+ layer->planes[i].offset = avbuf->plane_info[i].offset; - layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; - } - -@@ -934,6 +934,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; -+ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset; - - if (want_mmap) - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -@@ -941,6 +942,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); - } else { - avbuf->plane_info[i].length = avbuf->buf.length; -+ avbuf->plane_info[i].offset = 0; - - if (want_mmap) - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 1ac32c5989..d91d5d1dd0 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -66,7 +66,8 @@ typedef struct V4L2Buffer { - - /* keep track of the mmap address and mmap length */ - struct V4L2Plane_info { -- int bytesperline; -+ size_t 
bytesperline; -+ size_t offset; - void * mm_addr; - size_t length; - } plane_info[VIDEO_MAX_PLANES]; - -From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 17:57:27 +0000 -Subject: [PATCH 107/136] v4l2m2m: Fix crash if init errors out before setting - avctx - ---- - libavcodec/v4l2_m2m.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 1e30d15fd8..ac6bae0dc3 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -278,7 +278,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - - av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); - -- if (av_codec_is_decoder(s->avctx->codec)) -+ if (s->avctx && av_codec_is_decoder(s->avctx->codec)) - av_packet_unref(&s->buf_pkt); - - if (s->fd >= 0) { - -From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 18:10:30 +0000 -Subject: [PATCH 108/136] v4l2_buffers: Add and use ctx_to_m2mctx + error debug - ---- - libavcodec/v4l2_buffers.c | 22 +++++++++++++++------- - 1 file changed, 15 insertions(+), 7 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 5ca58ea593..e28ef2d1e8 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -41,11 +41,16 @@ - #define USEC_PER_SEC 1000000 - static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - -+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) -+{ -+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? -+ container_of(ctx, V4L2m2mContext, output) : -+ container_of(ctx, V4L2m2mContext, capture); -+} -+ - static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) - { -- return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? 
-- container_of(buf->context, V4L2m2mContext, output) : -- container_of(buf->context, V4L2m2mContext, capture); -+ return ctx_to_m2mctx(buf->context); - } - - static inline AVCodecContext *logger(const V4L2Buffer * const buf) -@@ -883,6 +888,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - int ret, i; - V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); - AVBufferRef * bufref; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - - *pbufref = NULL; - if (avbuf == NULL) -@@ -910,7 +916,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - avbuf->buf.m.planes = avbuf->planes; - } - -- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); -+ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf); - if (ret < 0) - goto fail; - -@@ -969,10 +975,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct - } - - if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- if (buf_to_m2mctx(avbuf)->output_drm) { -+ if (s->output_drm) { - ret = v4l2_buffer_export_drm(avbuf); -- if (ret) -- goto fail; -+ if (ret) { -+ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n"); -+ goto fail; -+ } - } - } - - -From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 18:53:22 +0000 -Subject: [PATCH 109/136] v4l2m2m: Add ability to use cma alloced dmabufs as - well as v4l2 mmap - ---- - libavcodec/Makefile | 2 +- - libavcodec/v4l2_buffers.c | 65 ++++++++++++++++++++++++++------------- - libavcodec/v4l2_buffers.h | 2 ++ - libavcodec/v4l2_m2m.c | 6 +++- - libavcodec/v4l2_m2m.h | 4 +++ - libavcodec/v4l2_m2m_dec.c | 16 ++++++++++ - 6 files changed, 71 insertions(+), 24 deletions(-) - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 11f183c9b9..8b1d669834 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -170,7 +170,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - 
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o - OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ -- weak_link.o -+ weak_link.o v4l2_req_dmabufs.o - OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ - v4l2_req_devscan.o weak_link.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index e28ef2d1e8..8d80d19788 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -36,6 +36,7 @@ - #include "v4l2_context.h" - #include "v4l2_buffers.h" - #include "v4l2_m2m.h" -+#include "v4l2_req_dmabufs.h" - #include "weak_link.h" - - #define USEC_PER_SEC 1000000 -@@ -477,33 +478,46 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) - av_buffer_unref(&bufref); - } - -+static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length; -+} -+ - static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - { -- struct v4l2_exportbuffer expbuf; - int i, ret; -+ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf); - - for (i = 0; i < avbuf->num_planes; i++) { -- memset(&expbuf, 0, sizeof(expbuf)); -- -- expbuf.index = avbuf->buf.index; -- expbuf.type = avbuf->buf.type; -- expbuf.plane = i; -+ int dma_fd = -1; -+ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i); -+ -+ if (s->db_ctl != NULL) { -+ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL) -+ return AVERROR(ENOMEM); -+ dma_fd = dmabuf_fd(avbuf->dmabuf[i]); -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) -+ avbuf->buf.m.planes[i].m.fd = dma_fd; -+ else -+ avbuf->buf.m.fd = dma_fd; -+ } -+ else { -+ struct v4l2_exportbuffer expbuf; -+ memset(&expbuf, 0, sizeof(expbuf)); - -- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); -- if (ret < 0) -- return AVERROR(errno); -+ expbuf.index = avbuf->buf.index; -+ expbuf.type = avbuf->buf.type; -+ expbuf.plane = i; - -- 
if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { -- /* drm frame */ -- avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; -- avbuf->drm_frame.objects[i].fd = expbuf.fd; -- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -- } else { -- /* drm frame */ -- avbuf->drm_frame.objects[0].size = avbuf->buf.length; -- avbuf->drm_frame.objects[0].fd = expbuf.fd; -- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ dma_fd = expbuf.fd; - } -+ -+ avbuf->drm_frame.objects[i].size = blen; -+ avbuf->drm_frame.objects[i].fd = dma_fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; - } - - return 0; -@@ -870,9 +884,16 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) - munmap(p->mm_addr, p->length); - } - -- for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -- if (avbuf->drm_frame.objects[i].fd != -1) -- close(avbuf->drm_frame.objects[i].fd); -+ if (avbuf->dmabuf[0] == NULL) { -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -+ if (avbuf->drm_frame.objects[i].fd != -1) -+ close(avbuf->drm_frame.objects[i].fd); ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "idctdsp.%s", tests[t].name)) { ++ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t); ++ RANDOMIZE_BUFFER16(src, 64); ++ RANDOMIZE_BUFFER8(dst, 10 * 24); ++ call_ref(src0, dst0 + 24 + 8, 24); ++ call_new(src1, dst1 + 24 + 8, 24); ++ if (memcmp(dst0, dst1, 10 * 24)) ++ fail(); ++ bench_new(src1, dst1 + 24 + 8, 24); + } + } -+ else { -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) { -+ dmabuf_free(avbuf->dmabuf[i]); -+ } - } - - av_buffer_unref(&avbuf->ref_buf); -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 
d91d5d1dd0..444ad94b14 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -46,6 +46,7 @@ enum V4L2Buffer_status { - */ - struct V4L2Context; - struct ff_weak_link_client; -+struct dmabuf_h; - - typedef struct V4L2Buffer { - /* each buffer needs to have a reference to its context -@@ -80,6 +81,7 @@ typedef struct V4L2Buffer { - - enum V4L2Buffer_status status; - -+ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here - } V4L2Buffer; - - /** -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index ac6bae0dc3..f802687b1b 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -34,6 +34,7 @@ - #include "v4l2_context.h" - #include "v4l2_fmt.h" - #include "v4l2_m2m.h" -+#include "v4l2_req_dmabufs.h" - - static void - xlat_init(xlat_track_t * const x) -@@ -75,7 +76,7 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - - s->capture.done = s->output.done = 0; - s->capture.name = "capture"; -- s->capture.buf_mem = V4L2_MEMORY_MMAP; -+ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; - s->output.name = "output"; - s->output.buf_mem = s->input_drm ? 
V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; - atomic_init(&s->refcount, 0); -@@ -94,12 +95,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - if (v4l2_mplane_video(&cap)) { - s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; - s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ s->output.format.type = s->output.type; - return 0; - } - - if (v4l2_splane_video(&cap)) { - s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ s->output.format.type = s->output.type; - return 0; - } - -@@ -293,6 +296,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - - ff_v4l2_context_release(&s->output); - -+ dmabufs_ctl_unref(&s->db_ctl); - close(s->fd); - s->fd = -1; - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 26a7161042..0f41f94694 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -71,6 +71,8 @@ typedef struct xlat_track_s { - V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; - } xlat_track_t; - -+struct dmabufs_ctl; ++} + - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; - int fd; -@@ -124,6 +126,7 @@ typedef struct V4L2m2mContext { - /* Quirks */ - unsigned int quirks; - -+ struct dmabufs_ctl * db_ctl; - } V4L2m2mContext; - - typedef struct V4L2m2mPriv { -@@ -134,6 +137,7 @@ typedef struct V4L2m2mPriv { - - int num_output_buffers; - int num_capture_buffers; -+ const char * dmabuf_alloc; - enum AVPixelFormat pix_fmt; - } V4L2m2mPriv; - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 47b2735f82..4d17057298 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -41,6 +41,7 @@ - #include "v4l2_context.h" - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" -+#include "v4l2_req_dmabufs.h" - - // Pick 64 for max last count - that is >1sec at 60fps - #define STATS_LAST_COUNT_MAX 64 -@@ -896,6 +897,20 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - s->output_drm = 0; - } - -+ s->db_ctl = NULL; -+ if 
(priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { -+ if (strcmp(priv->dmabuf_alloc, "cma") == 0) -+ s->db_ctl = dmabufs_ctl_new(); -+ else { -+ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc); -+ return AVERROR(EINVAL); -+ } -+ if (!s->db_ctl) { -+ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc); -+ return AVERROR(ENOMEM); -+ } -+ } -+ - s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); - if (!s->device_ref) { - ret = AVERROR(ENOMEM); -@@ -1000,6 +1015,7 @@ static const AVOption options[] = { - { "num_capture_buffers", "Number of buffers in the capture context", - OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, - { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, -+ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, - { NULL}, - }; - - -From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Dec 2022 19:05:47 +0000 -Subject: [PATCH 110/136] testfilt: Skeleton of hw filter test code - ---- - pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 83 insertions(+) - create mode 100755 pi-util/testfilt.py - -diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py -new file mode 100755 -index 0000000000..b322dac0c2 ++void checkasm_check_idctdsp(void) ++{ ++ check_add_put_clamped(); ++ report("idctdsp"); ++} +diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c +new file mode 100644 +index 0000000000..0888714c4c --- /dev/null -+++ b/pi-util/testfilt.py -@@ -0,0 +1,83 @@ -+#!/usr/bin/env python3 ++++ b/tests/checkasm/rpi_sand.c +@@ -0,0 +1,118 @@ ++/* ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ + -+import string -+import os -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * ++#include ++#include "checkasm.h" ++#include "libavutil/common.h" ++#include "libavutil/rpi_sand_fns.h" + -+class validator: -+ def __init__(self): -+ self.ok = False -+ -+ def isok(self): -+ return self.ok -+ -+ def setok(self): -+ self.ok = True -+ -+class valid_regex(validator): -+ def __init__(self, regex): -+ super().__init__() -+ self.regex = re.compile(regex) -+ -+ def scanline(self, line): -+ if self.isok() or self.regex.search(line): -+ self.setok() -+ -+ -+def validate(validators, flog): -+ for line in flog: -+ for v in validators: -+ v.scanline(line) -+ -+ ok = True -+ for v in validators: -+ if not v.isok(): -+ ok = False -+ # complain -+ print("Test failed") -+ -+ if ok: -+ print("OK") -+ return ok -+ -+def runtest(name, ffmpeg, args, suffix, validators): -+ log_root = os.path.join("/tmp", "testfilt", name) -+ ofilename = os.path.join(log_root, name + suffix) -+ -+ if not os.path.exists(log_root): -+ os.makedirs(log_root) -+ -+ try: -+ os.remove(ofilename) -+ except: -+ pass -+ -+ flog = open(os.path.join(log_root, name + ".log"), "wb") -+ ffargs = [ffmpeg] + args + [ofilename] -+ -+ 
subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False) -+ flog.close -+ -+ flog = open(os.path.join(log_root, name + ".log"), "rt") -+ return validate(validators, flog) -+ -+def sayok(log_root, flog): -+ print("Woohoo") -+ return True -+ -+if __name__ == '__main__': -+ -+ argp = argparse.ArgumentParser(description="FFmpeg filter tester") -+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name") -+ args = argp.parse_args() -+ -+ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i", -+ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv", -+# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv", -+ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv", -+ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')]) - -From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 5 Jan 2023 14:39:30 +0000 -Subject: [PATCH 111/136] pixfmt: Add a #define to indicate presence of SAND - formats - ---- - libavutil/pixfmt.h | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 22f70007c3..5cc780e7d5 100644 ---- a/libavutil/pixfmt.h -+++ b/libavutil/pixfmt.h -@@ -378,6 +378,8 @@ enum AVPixelFormat { - AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian - AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian - // RPI - not on ifdef so can be got at by calling progs -+// #define so code that uses this can know it is there -+#define AVUTIL_HAVE_PIX_FMT_SAND 1 - AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding - AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding - AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding - -From 
23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 11 Jan 2023 16:30:37 +0000 -Subject: [PATCH 112/136] v4l2_m2m_dec: Fix initial pkt send if no extradata - ---- - libavcodec/v4l2_m2m_dec.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 4d17057298..9daf05adfe 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -240,7 +240,7 @@ copy_extradata(AVCodecContext * const avctx, - else - len = src_len < 0 ? AVERROR(EINVAL) : src_len; - -- // Zero length is OK but we swant to stop - -ve is error val -+ // Zero length is OK but we want to stop - -ve is error val - if (len <= 0) - return len; - -@@ -525,7 +525,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const - - if (s->extdata_sent) - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); -- else if (s->extdata_data) -+ else - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); - - if (ret == AVERROR(EAGAIN)) { - -From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 16 Jan 2023 16:05:09 +0000 -Subject: [PATCH 113/136] v4l2m2m_dec: Make capture timeout long once pending - count > 31 - -For some applications (ffmpeg command line) the current heuristic of adding -a short timeout and preferring DQ over Q once we think we have buffers -Qed in V4L2 is insufficient to prevent arbitrary buffer growth. -Unfortunately the current method of guessing the number of Qed buffers isn't -reliable enough to allow for a long timeout with only a few few buffers -believed pending so only do so once the number of buffers believed pending -exceeds plausible inaccuracies caused by buffer reordering. - -The limit could be optimised by codec or apparent latency but a simple -number should reduce the unexpected consequences. 
---- - libavcodec/v4l2_m2m.h | 3 ++- - libavcodec/v4l2_m2m_dec.c | 18 ++++++++++++++---- - 2 files changed, 16 insertions(+), 5 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 0f41f94694..ded1478a49 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -66,7 +66,7 @@ typedef struct pts_stats_s - - typedef struct xlat_track_s { - unsigned int track_no; -- int64_t last_pts; -+ int64_t last_pts; // Last valid PTS decoded - int64_t last_opaque; - V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; - } xlat_track_t; -@@ -88,6 +88,7 @@ typedef struct V4L2m2mContext { - - /* null frame/packet received */ - int draining; -+ int running; - AVPacket buf_pkt; - - /* Reference to a frame. Only used during encoding */ -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 9daf05adfe..c8ab883d7e 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -582,7 +582,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - - do { - const int pending = xlat_pending(&s->xlat); -- const int prefer_dq = (pending > 3); -+ const int prefer_dq = (pending > 4); - const int last_src_rv = src_rv; - - av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); -@@ -611,10 +611,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - // (b) enqueue returned a status indicating that decode should be attempted - if (dst_rv != 0 && TRY_DQ(src_rv)) { - // Pick a timeout depending on state -+ // The pending count isn't completely reliable so it is good enough -+ // hint that we want a frame but not good enough to require it in -+ // all cases; however if it has got > 31 that exceeds its margin of -+ // error so require a frame to prevent ridiculous levels of latency - const int t = - src_rv == NQ_Q_FULL ? -1 : - src_rv == NQ_DRAINING ? 300 : -- prefer_dq ? 5 : 0; -+ prefer_dq ? (s->running && pending > 31 ? 
100 : 5) : 0; - - // Dequeue frame will unref any previous contents of frame - // if it returns success so we don't need an explicit unref -@@ -631,8 +635,13 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - } - -- if (dst_rv == 0) -+ if (dst_rv == 0) { - set_best_effort_pts(avctx, &s->pts_stat, frame); -+ if (!s->running) { -+ s->running = 1; -+ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n"); -+ } -+ } - - if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { - av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -@@ -998,7 +1007,8 @@ static void v4l2_decode_flush(AVCodecContext *avctx) - - // resend extradata - s->extdata_sent = 0; -- // clear EOS status vars -+ // clear status vars -+ s->running = 0; - s->draining = 0; - output->done = 0; - capture->done = 0; - -From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 6 Feb 2023 19:23:16 +0000 -Subject: [PATCH 114/136] Initial buffersink alloc callback code - -(cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb) ---- - libavfilter/buffersink.c | 44 ++++++++++++++++++++++++++++++++++++++++ - libavfilter/buffersink.h | 3 +++ - 2 files changed, 47 insertions(+) - -diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c -index 306c283f77..d3c82aabf3 100644 ---- a/libavfilter/buffersink.c -+++ b/libavfilter/buffersink.c -@@ -62,6 +62,11 @@ typedef struct BufferSinkContext { - int sample_rates_size; - - AVFrame *peeked_frame; -+ -+ union { -+ av_buffersink_alloc_video_frame * video; -+ } alloc_cb; -+ void * alloc_v; - } BufferSinkContext; - - #define NB_ITEMS(list) (list ## _size / sizeof(*list)) -@@ -154,6 +159,44 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx, - return get_frame_internal(ctx, frame, 0, nb_samples); - } - -+static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h) -+{ -+ AVFilterContext * const ctx = link->dst; -+ BufferSinkContext * const bs = 
ctx->priv; -+ return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) : -+ ff_default_get_video_buffer(link, w, h); -+} -+ -+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v) -+{ -+ BufferSinkContext * const bs = ctx->priv; -+ bs->alloc_cb.video = cb; -+ bs->alloc_v = v; -+ return 0; -+} -+ -+#if FF_API_BUFFERSINK_ALLOC -+AVBufferSinkParams *av_buffersink_params_alloc(void) -+{ -+ static const int pixel_fmts[] = { AV_PIX_FMT_NONE }; -+ AVBufferSinkParams *params = av_malloc(sizeof(AVBufferSinkParams)); -+ if (!params) -+ return NULL; -+ -+ params->pixel_fmts = pixel_fmts; -+ return params; -+} -+ -+AVABufferSinkParams *av_abuffersink_params_alloc(void) -+{ -+ AVABufferSinkParams *params = av_mallocz(sizeof(AVABufferSinkParams)); -+ -+ if (!params) -+ return NULL; -+ return params; -+} ++#if ARCH_ARM ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" ++#elif ARCH_AARCH64 ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" +#endif + - static av_cold int common_init(AVFilterContext *ctx) - { - BufferSinkContext *buf = ctx->priv; -@@ -381,6 +424,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = { - { - .name = "default", - .type = AVMEDIA_TYPE_VIDEO, -+ .get_buffer = {.video = alloc_video_buffer}, - }, - }; - -diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h -index 64e08de53e..09737d322f 100644 ---- a/libavfilter/buffersink.h -+++ b/libavfilter/buffersink.h -@@ -166,6 +166,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame); - */ - int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples); - -+typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h); -+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v); -+ - /** - * @} - */ - -From 
a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 30 Jan 2023 17:23:12 +0000 -Subject: [PATCH 115/136] v4l2_m2m_dec: Add a profile check - -Check the profile in avctx aginst what the v4l2 driver advertises. If -the driver doesn't support the check then just accept anything. - -(cherry picked from commit 6dd83dead9ebce419fdea152db0c9f5e9a94e9ef) ---- - libavcodec/v4l2_m2m_dec.c | 125 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 125 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index c8ab883d7e..098adf4821 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -715,6 +715,127 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - } - #endif - -+static uint32_t -+avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile) ++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c) +{ -+ switch (codec_id) { -+ case AV_CODEC_ID_H264: -+ switch (avprofile) { -+ case FF_PROFILE_H264_BASELINE: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE; -+ case FF_PROFILE_H264_CONSTRAINED_BASELINE: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE; -+ case FF_PROFILE_H264_MAIN: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN; -+ case FF_PROFILE_H264_EXTENDED: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED; -+ case FF_PROFILE_H264_HIGH: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH; -+ case FF_PROFILE_H264_HIGH_10: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10; -+ case FF_PROFILE_H264_HIGH_10_INTRA: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA; -+ case FF_PROFILE_H264_MULTIVIEW_HIGH: -+ case FF_PROFILE_H264_HIGH_422: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422; -+ case FF_PROFILE_H264_HIGH_422_INTRA: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA; -+ case FF_PROFILE_H264_STEREO_HIGH: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH; -+ case FF_PROFILE_H264_HIGH_444_PREDICTIVE: -+ return 
V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE; -+ case FF_PROFILE_H264_HIGH_444_INTRA: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA; -+ case FF_PROFILE_H264_CAVLC_444: -+ return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA; -+ case FF_PROFILE_H264_HIGH_444: -+ default: -+ break; -+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12, -+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13, -+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14, -+// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16, -+// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17, -+ } -+ break; -+ case AV_CODEC_ID_MPEG2VIDEO: -+ case AV_CODEC_ID_MPEG4: -+ case AV_CODEC_ID_VC1: -+ case AV_CODEC_ID_VP8: -+ case AV_CODEC_ID_VP9: -+ case AV_CODEC_ID_AV1: -+ // Most profiles are a simple number that matches the V4L2 enum -+ return avprofile; -+ default: -+ break; -+ } -+ return ~(uint32_t)0; ++ return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20); +} + -+// This check mirrors Chrome's profile check by testing to see if the profile -+// exists as a possible value for the V4L2 profile control -+static int -+check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) ++void checkasm_check_rpi_sand(void) +{ -+ struct v4l2_queryctrl query_ctrl; -+ struct v4l2_querymenu query_menu; -+ uint32_t profile_id; ++ const unsigned int w = 1280; ++ const unsigned int h = 66; ++ const unsigned int stride1 = 128; ++ const unsigned int stride2 = h*3/2; ++ const unsigned int ssize = ((w+95)/96)*128*h*3/2; ++ const unsigned int ysize = ((w + 32) * (h + 32) * 2); + -+ // An unset profile is almost certainly zero or -99 - do not reject -+ if (avctx->profile <= 0) { -+ av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n"); -+ return 0; ++ uint8_t * sbuf0 = malloc(ssize); ++ uint8_t * sbuf1 = malloc(ssize); ++ uint8_t * ybuf0 = malloc(ysize); ++ uint8_t * ybuf1 = malloc(ysize); ++ uint8_t * vbuf0 = malloc(ysize); ++ uint8_t * vbuf1 = malloc(ysize); ++ uint8_t * yframe0 = (w + 
32) * 16 + ybuf0; ++ uint8_t * yframe1 = (w + 32) * 16 + ybuf1; ++ uint8_t * vframe0 = (w + 32) * 16 + vbuf0; ++ uint8_t * vframe1 = (w + 32) * 16 + vbuf1; ++ unsigned int i; ++ ++ for (i = 0; i != ssize; i += 4) ++ *(uint32_t*)(sbuf0 + i) = rnd(); ++ memcpy(sbuf1, sbuf0, ssize); ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) { ++ declare_func(void, uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h); ++ call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize)) ++ fail(); ++ ++ bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); + } + -+ memset(&query_ctrl, 0, sizeof(query_ctrl)); -+ switch (avctx->codec_id) { -+ case AV_CODEC_ID_MPEG2VIDEO: -+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE; -+ break; -+ case AV_CODEC_ID_MPEG4: -+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE; -+ break; -+ case AV_CODEC_ID_H264: -+ profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE; -+ break; -+ case AV_CODEC_ID_VP8: -+ profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE; -+ break; -+ case AV_CODEC_ID_VP9: -+ profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE; -+ break; -+#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE -+ case AV_CODEC_ID_AV1: -+ profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE; -+ break; -+#endif -+ default: -+ av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id); -+ return 0; ++ if (check_func(have_neon(av_get_cpu_flags()) ? 
ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) { ++ declare_func(void, uint8_t * u_dst, const unsigned int u_stride, ++ uint8_t * v_dst, const unsigned int v_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ memset(vbuf0, 0xbb, ysize); ++ memset(vbuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2); ++ call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize) ++ || memcmp(vbuf0, vbuf1, ysize)) ++ fail(); ++ ++ bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); + } + -+ query_ctrl = (struct v4l2_queryctrl){.id = profile_id}; -+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) { -+ av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id); ++ ++ report("sand30"); ++ ++ free(sbuf0); ++ free(sbuf1); ++ free(ybuf0); ++ free(ybuf1); ++ free(vbuf0); ++ free(vbuf1); ++} ++ +diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c +new file mode 100644 +index 0000000000..52628d15e4 +--- /dev/null ++++ b/tests/checkasm/vc1dsp.c +@@ -0,0 +1,452 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/vc1dsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, ++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++ int width; ++ int height; ++} test; ++ ++typedef struct matrix { ++ size_t width; ++ size_t height; ++ float d[]; ++} matrix; ++ ++static const matrix T8 = { 8, 8, { ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 16, 15, 9, 4, -4, -9, -15, -16, ++ 16, 6, -6, -16, -16, -6, 6, 16, ++ 15, -4, -16, -9, 9, 16, 4, -15, ++ 12, -12, -12, 12, 12, -12, -12, 12, ++ 9, -16, 4, 15, -15, -4, 16, -9, ++ 6, -16, 16, -6, -6, 16, -16, 6, ++ 4, -9, 15, -16, 16, -15, 9, -4 ++} }; ++ ++static const matrix T4 = { 4, 4, { ++ 17, 17, 17, 17, ++ 22, 10, -10, -22, ++ 17, -17, -17, 17, ++ 10, -22, 22, -10 ++} }; ++ ++static const matrix T8t = { 8, 8, { ++ 12, 16, 16, 15, 12, 9, 6, 4, ++ 12, 15, 6, -4, -12, -16, -16, -9, ++ 12, 9, -6, -16, -12, 4, 16, 15, ++ 12, 4, -16, -9, 12, 15, -6, -16, ++ 12, -4, -16, 9, 12, -15, -6, 16, ++ 12, -9, -6, 16, -12, -4, 16, -15, ++ 12, -15, 6, 4, -12, 16, -16, 9, ++ 12, -16, 16, -15, 12, -9, 6, -4 ++} }; ++ ++static const matrix T4t = { 4, 4, { ++ 17, 22, 17, 10, ++ 17, 10, -17, -22, ++ 17, -10, -17, 22, ++ 17, -22, 17, -10 ++} }; ++ ++static matrix *new_matrix(size_t width, size_t height) ++{ ++ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); ++ if (out == NULL) { ++ fprintf(stderr, "Memory allocation 
failure\n"); ++ exit(EXIT_FAILURE); + } -+ else { -+ av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id); ++ out->width = width; ++ out->height = height; ++ return out; ++} + -+ query_menu = (struct v4l2_querymenu){ -+ .id = query_ctrl.id, -+ .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile), -+ }; -+ -+ if (query_menu.index > query_ctrl.maximum || -+ query_menu.index < query_ctrl.minimum || -+ ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) { -+ return AVERROR(ENOENT); -+ } ++static matrix *multiply(const matrix *a, const matrix *b) ++{ ++ matrix *out; ++ if (a->width != b->height) { ++ fprintf(stderr, "Incompatible multiplication\n"); ++ exit(EXIT_FAILURE); + } -+ -+ return 0; -+}; -+ - static int - check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) - { -@@ -955,6 +1076,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - if ((ret = check_size(avctx, s)) != 0) - return ret; - -+ if ((ret = check_profile(avctx, s)) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); -+ return ret; -+ } - return 0; - } - - -From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 1 Feb 2023 17:24:39 +0000 -Subject: [PATCH 116/136] v4l2_m2m_dec: Add extradata parse for h264 & hevc - -If we have extradata we can extract profile & level and potentailly -other useful info from it. Use the codec parser to get it if the decoder -is configured. 
- -(cherry picked from commit 6d431e79adeb246c2ed8cebce9011d81175a3906) ---- - libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++++++++- - 1 file changed, 83 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 098adf4821..e64bc707d3 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -21,6 +21,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "config.h" -+ - #include - #include - -@@ -43,6 +45,13 @@ - #include "v4l2_fmt.h" - #include "v4l2_req_dmabufs.h" - -+#if CONFIG_H264_DECODER -+#include "h264_parse.h" -+#endif -+#if CONFIG_HEVC_DECODER -+#include "hevc_parse.h" -+#endif -+ - // Pick 64 for max last count - that is >1sec at 60fps - #define STATS_LAST_COUNT_MAX 64 - #define STATS_INTERVAL_MAX (1 << 30) -@@ -956,6 +965,78 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx) - return size + (1 << 16); - } - -+static void -+parse_extradata(AVCodecContext *avctx) -+{ -+ if (!avctx->extradata || !avctx->extradata_size) -+ return; -+ -+ switch (avctx->codec_id) { -+#if CONFIG_H264_DECODER -+ case AV_CODEC_ID_H264: -+ { -+ H264ParamSets ps = {{NULL}}; -+ int is_avc = 0; -+ int nal_length_size = 0; -+ int ret; -+ -+ ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, -+ &ps, &is_avc, &nal_length_size, -+ avctx->err_recognition, avctx); -+ if (ret > 0) { -+ const SPS * sps = NULL; -+ unsigned int i; -+ for (i = 0; i != MAX_SPS_COUNT; ++i) { -+ if (ps.sps_list[i]) { -+ sps = (const SPS *)ps.sps_list[i]->data; -+ break; -+ } -+ } -+ if (sps) { -+ avctx->profile = ff_h264_get_profile(sps); -+ avctx->level = sps->level_idc; -+ } -+ } -+ ff_h264_ps_uninit(&ps); -+ break; ++ out = new_matrix(b->width, a->height); ++ for (int j = 0; j < out->height; ++j) ++ for (int i = 0; i < out->width; ++i) { ++ float sum = 0; ++ for (int k = 0; k < a->width; ++k) ++ sum += a->d[j * a->width + k] * b->d[k * 
b->width + i]; ++ out->d[j * out->width + i] = sum; + } -+#endif -+#if CONFIG_HEVC_DECODER -+ case AV_CODEC_ID_HEVC: -+ { -+ HEVCParamSets ps = {{NULL}}; -+ HEVCSEI sei = {{{{0}}}}; -+ int is_nalff = 0; -+ int nal_length_size = 0; -+ int ret; -+ -+ ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, -+ &ps, &sei, &is_nalff, &nal_length_size, -+ avctx->err_recognition, 0, avctx); -+ if (ret > 0) { -+ const HEVCSPS * sps = NULL; -+ unsigned int i; -+ for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) { -+ if (ps.sps_list[i]) { -+ sps = (const HEVCSPS *)ps.sps_list[i]->data; -+ break; -+ } -+ } -+ if (sps) { -+ avctx->profile = sps->ptl.general_ptl.profile_idc; -+ avctx->level = sps->ptl.general_ptl.level_idc; -+ } -+ } -+ ff_hevc_ps_uninit(&ps); -+ ff_hevc_reset_sei(&sei); -+ break; -+ } -+#endif -+ default: -+ break; -+ } ++ return out; +} + - static av_cold int v4l2_decode_init(AVCodecContext *avctx) - { - V4L2Context *capture, *output; -@@ -976,7 +1057,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - avctx->ticks_per_frame = 2; - } - -- av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); -+ parse_extradata(avctx); -+ - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; - -From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 20 Mar 2023 18:12:51 +0000 -Subject: [PATCH 117/136] clean_usr_libs: Now wipes the include files too - -When swapping ffmpeg versions obsolete makefiles could confuse -configure utilities. 
---- - pi-util/clean_usr_libs.sh | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh -index b3b2d5509d..01bd6a6a22 100755 ---- a/pi-util/clean_usr_libs.sh -+++ b/pi-util/clean_usr_libs.sh -@@ -1,4 +1,20 @@ - set -e -+U=/usr/include/arm-linux-gnueabihf -+rm -rf $U/libavcodec -+rm -rf $U/libavdevice -+rm -rf $U/libavfilter -+rm -rf $U/libavformat -+rm -rf $U/libavutil -+rm -rf $U/libswresample -+rm -rf $U/libswscale -+U=/usr/include/aarch64-linux-gnu -+rm -rf $U/libavcodec -+rm -rf $U/libavdevice -+rm -rf $U/libavfilter -+rm -rf $U/libavformat -+rm -rf $U/libavutil -+rm -rf $U/libswresample -+rm -rf $U/libswscale - U=/usr/lib/arm-linux-gnueabihf - rm -f $U/libavcodec.* - rm -f $U/libavdevice.* - -From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 20 Mar 2023 18:15:08 +0000 -Subject: [PATCH 118/136] vulkan: Add missing decode extension defines - -When building on bookworm the video decode extension names -were missing. This adds them. I expect this patch will be -obsolete shortly but it solves a current problem. 
---- - libavutil/hwcontext_vulkan.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c -index 2a9b5f4aac..11e7945f18 100644 ---- a/libavutil/hwcontext_vulkan.c -+++ b/libavutil/hwcontext_vulkan.c -@@ -57,6 +57,14 @@ - #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x) - #endif - -+// Sometimes missing definitions -+#ifndef VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME -+#define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264" -+#endif -+#ifndef VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME -+#define VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME "VK_EXT_video_decode_h265" -+#endif -+ - typedef struct VulkanQueueCtx { - VkFence fence; - VkQueue queue; - -From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 21 Mar 2023 14:20:05 +0000 -Subject: [PATCH 119/136] v4l2_m2m_dec: Fix config file for finding if decoder - enabled - -Fixes parsing of extradata for profile testing. 5.x changed where that -info is defined. 
---- - libavcodec/v4l2_m2m_dec.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index e64bc707d3..91136f03da 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -21,7 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - --#include "config.h" -+#include "config_components.h" - - #include - #include - -From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 21 Mar 2023 14:23:20 +0000 -Subject: [PATCH 120/136] v4l2_m2m_dec: Display profile given if skipped in - debug - ---- - libavcodec/v4l2_m2m_dec.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 91136f03da..d124c7b1fc 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -792,7 +792,7 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) - - // An unset profile is almost certainly zero or -99 - do not reject - if (avctx->profile <= 0) { -- av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n"); -+ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile); - return 0; - } - - -From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 22 Mar 2023 16:08:08 +0000 -Subject: [PATCH 121/136] conf_native: Fix for 64-bit kernel with 32-bit - userspace - -(cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327) ---- - pi-util/conf_native.sh | 32 +++++++++++++++++++++----------- - 1 file changed, 21 insertions(+), 11 deletions(-) - -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -index 082d9b5832..0a7d230f1b 100755 ---- a/pi-util/conf_native.sh -+++ b/pi-util/conf_native.sh -@@ -33,18 +33,28 @@ RPI_LIBDIRS= - RPI_DEFINES= - RPI_EXTRALIBS= - --if [ "$MC" == "arm64" ]; then -- echo "M/C aarch64" -- A=aarch64-linux-gnu 
-- B=arm64 --elif [ "$MC" == "armhf" ]; then -- echo "M/C armv7" -- A=arm-linux-gnueabihf -- B=armv7 -- MCOPTS="--arch=armv6t2 --cpu=cortex-a7" -- RPI_DEFINES=-mfpu=neon-vfpv4 -+# uname -m gives kernel type which may not have the same -+# 32/64bitness as userspace :-( getconf shoudl provide the answer -+# but use uname to check we are on the right processor -+MC=`uname -m` -+LB=`getconf LONG_BIT` -+if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then -+ if [ "$LB" == "32" ]; then -+ echo "M/C armv7" -+ A=arm-linux-gnueabihf -+ B=armv7 -+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" -+ RPI_DEFINES=-mfpu=neon-vfpv4 -+ elif [ "$LB" == "64" ]; then -+ echo "M/C aarch64" -+ A=aarch64-linux-gnu -+ B=arm64 -+ else -+ echo "Unknown LONG_BIT name: $LB" -+ exit 1 -+ fi - else -- echo Unexpected architecture $MC -+ echo "Unknown machine name: $MC" - exit 1 - fi - - -From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 20 Apr 2023 11:48:25 +0000 -Subject: [PATCH 122/136] conf_native: Add install prefix variation - -(cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d) ---- - pi-util/BUILD.txt | 32 ++++++++++++++++++++------------ - pi-util/conf_native.sh | 14 ++++++++++++-- - 2 files changed, 32 insertions(+), 14 deletions(-) - -diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt -index b050971f63..2b62d660c0 100644 ---- a/pi-util/BUILD.txt -+++ b/pi-util/BUILD.txt -@@ -24,6 +24,8 @@ There are a few choices here - paths being confused and therefore running the wrong code, Shared - is what is needed, in most cases, when building for use by other - programs. -+ --usr Set install dir to /usr (i.e. system default) rather than in -+ /install - - So for a static build - --------------------- -@@ -37,23 +39,29 @@ You can now run ffmpeg directly from where it was built - For a shared build - ------------------ - --$ pi-util/conf_native.sh -- --You will normally want an install target if shared. 
Note that the script has --set this up to be generated in out//install, you don't have to worry --about overwriting your system libs. -+There are two choices here - -+$ pi-util/conf_native.sh - $ make -j8 -C out/ install - --You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was --built or install the image on the system - you have to be careful to get rid --of all other ffmpeg libs or confusion may result. There is a little script --that wipes all other versions - obviously use with care! -+This sets the install prefix to /install and is probably what you -+want if you don't want to overwrite the system files. - --$ sudo pi-util/clean_usr_libs.sh -+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was -+built. You can copy the contents of /install to /usr and that mostly -+works. The only downside is that paths in pkgconfig end up being set to the -+install directory in your build directory which may be less than ideal when -+building other packages. - --Then simply copying from the install to /usr works -+The alternative if you just want to replace the system libs is: - --$ sudo cp -r out//install/* /usr -+$ pi-util/conf_native.sh --usr -+$ make -j8 -C out/ -+$ sudo pi-util/clean_usr_libs.sh -+$ sudo make -j8 -C out/ install - -+The clean_usr_libs.sh step wipes any existing libs & includes (for all -+architectures) from the system which helps avoid confusion when running other -+progs as you can be sure you're not running old code which is unfortunately -+easy to do otherwise. 
- -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -index 0a7d230f1b..f0ed159594 100755 ---- a/pi-util/conf_native.sh -+++ b/pi-util/conf_native.sh -@@ -9,6 +9,7 @@ RPI_KEEPS="" - - NOSHARED= - MMAL= -+USR_PREFIX= - - while [ "$1" != "" ] ; do - case $1 in -@@ -18,8 +19,14 @@ while [ "$1" != "" ] ; do - --mmal) - MMAL=1 - ;; -+ --usr) -+ USR_PREFIX=/usr -+ ;; - *) -- echo "Usage $0: [--noshared] [--mmal]" -+ echo "Usage $0: [--noshared] [--mmal] [--usr]" -+ echo " noshared Build static libs and executable - good for testing" -+ echo " mmal Build mmal decoders" -+ echo " usr Set install prefix to /usr [default=/install]" - exit 1 - ;; - esac -@@ -82,7 +89,9 @@ else - OUT=$BUILDBASE/$B-$C-$V-shared-rel - fi - --USR_PREFIX=$OUT/install -+if [ ! $USR_PREFIX ]; then -+ USR_PREFIX=$OUT/install -+fi - LIB_PREFIX=$USR_PREFIX/lib/$A - INC_PREFIX=$USR_PREFIX/include/$A - -@@ -113,6 +122,7 @@ $FFSRC/configure \ - --extra-libs="$RPI_EXTRALIBS"\ - --extra-version="rpi" - -+echo "Configured into $OUT" - - # gcc option for getting asm listing - # -Wa,-ahls - -From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 19 Apr 2023 10:47:58 +0000 -Subject: [PATCH 123/136] swcale: Add explicit bgr24->yv12 conversion - -(cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434) ---- - libswscale/rgb2rgb.c | 5 +++++ - libswscale/rgb2rgb.h | 7 +++++++ - libswscale/rgb2rgb_template.c | 36 ++++++++++++++++++++++++++++++----- - libswscale/swscale_unscaled.c | 22 +++++++++++++++++++++ - 4 files changed, 65 insertions(+), 5 deletions(-) - -diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c -index e98fdac8ea..84bb56e60e 100644 ---- a/libswscale/rgb2rgb.c -+++ b/libswscale/rgb2rgb.c -@@ -83,6 +83,11 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, - int width, int height, - int lumStride, int chromStride, int srcStride, - int32_t *rgb2yuv); -+void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, -+ uint8_t 
*udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); - void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, - int srcStride, int dstStride); - void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, -diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h -index f3951d523e..0028ab345f 100644 ---- a/libswscale/rgb2rgb.h -+++ b/libswscale/rgb2rgb.h -@@ -79,6 +79,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); - void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv); -+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv); - - /** - * Height should be a multiple of 2 and width should be a multiple of 16. -@@ -128,6 +131,10 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - int width, int height, - int lumStride, int chromStride, int srcStride, - int32_t *rgb2yuv); -+extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); - extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, - int srcStride, int dstStride); - -diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c -index 42c69801ba..e2437826dd 100644 ---- a/libswscale/rgb2rgb_template.c -+++ b/libswscale/rgb2rgb_template.c -@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, - * others are ignored in the C version. - * FIXME: Write HQ version. 
- */ --void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, -- int chromStride, int srcStride, int32_t *rgb2yuv) -+ int chromStride, int srcStride, int32_t *rgb2yuv, -+ const uint8_t x[9]) - { -- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; -- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; -- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; -+ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; -+ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; -+ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; - int y; - const int chromWidth = width >> 1; - -@@ -707,6 +708,30 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - } - } - -+void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) ++static void normalise(matrix *a) +{ -+ static const uint8_t x[9] = { -+ RY_IDX, GY_IDX, BY_IDX, -+ RU_IDX, GU_IDX, BU_IDX, -+ RV_IDX, GV_IDX, BV_IDX, -+ }; -+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); -+} -+ -+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ static const uint8_t x[9] = { -+ BY_IDX, GY_IDX, RY_IDX, -+ BU_IDX, GU_IDX, RU_IDX, -+ BV_IDX, GV_IDX, RV_IDX, -+ }; -+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); -+} -+ - static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, - uint8_t *dest, int width, int height, - int src1Stride, int src2Stride, int dstStride) -@@ -980,6 +1005,7 @@ 
static av_cold void rgb2rgb_init_c(void) - yuy2toyv12 = yuy2toyv12_c; - planar2x = planar2x_c; - ff_rgb24toyv12 = ff_rgb24toyv12_c; -+ ff_bgr24toyv12 = ff_bgr24toyv12_c; - interleaveBytes = interleaveBytes_c; - deinterleaveBytes = deinterleaveBytes_c; - vu9_to_vu12 = vu9_to_vu12_c; -diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c -index 9af2e7ecc3..9047030ae4 100644 ---- a/libswscale/swscale_unscaled.c -+++ b/libswscale/swscale_unscaled.c -@@ -1654,6 +1654,23 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], - return srcSliceH; - } - -+static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], -+ int srcStride[], int srcSliceY, int srcSliceH, -+ uint8_t *dst[], int dstStride[]) -+{ -+ ff_bgr24toyv12( -+ src[0], -+ dst[0] + srcSliceY * dstStride[0], -+ dst[1] + (srcSliceY >> 1) * dstStride[1], -+ dst[2] + (srcSliceY >> 1) * dstStride[2], -+ c->srcW, srcSliceH, -+ dstStride[0], dstStride[1], srcStride[0], -+ c->input_rgb2yuv_table); -+ if (dst[3]) -+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); -+ return srcSliceH; -+} -+ - static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], - int srcStride[], int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -@@ -2037,6 +2054,11 @@ void ff_get_unscaled_swscale(SwsContext *c) - (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) - c->convert_unscaled = bgr24ToYv12Wrapper; -+ /* rgb24toYV12 */ -+ if (srcFormat == AV_PIX_FMT_RGB24 && -+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && -+ !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ c->convert_unscaled = rgb24ToYv12Wrapper; - - /* RGB/BGR -> RGB/BGR (no dither needed forms) */ - if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) - -From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 20 Apr 2023 11:26:10 +0000 -Subject: [PATCH 
124/136] swscale: Add unscaled XRGB->YUV420P functions - -(cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8) ---- - libswscale/rgb2rgb.c | 20 ++++++ - libswscale/rgb2rgb.h | 16 +++++ - libswscale/rgb2rgb_template.c | 123 ++++++++++++++++++++++++++++++---- - libswscale/swscale_unscaled.c | 89 ++++++++++++++++++++++++ - 4 files changed, 236 insertions(+), 12 deletions(-) - -diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c -index 84bb56e60e..c3b9079d2b 100644 ---- a/libswscale/rgb2rgb.c -+++ b/libswscale/rgb2rgb.c -@@ -88,6 +88,26 @@ void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, - int width, int height, - int lumStride, int chromStride, int srcStride, - int32_t *rgb2yuv); -+void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, -+ uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, -+ uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, -+ uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, -+ uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); - void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, - int srcStride, int dstStride); - void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, -diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h -index 0028ab345f..a0dd3ffb79 100644 ---- a/libswscale/rgb2rgb.h -+++ b/libswscale/rgb2rgb.h -@@ -135,6 +135,22 @@ extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - int width, int height, - int lumStride, int chromStride, int srcStride, - 
int32_t *rgb2yuv); -+extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); -+extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, -+ int width, int height, -+ int lumStride, int chromStride, int srcStride, -+ int32_t *rgb2yuv); - extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, - int srcStride, int dstStride); - -diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c -index e2437826dd..703de90690 100644 ---- a/libswscale/rgb2rgb_template.c -+++ b/libswscale/rgb2rgb_template.c -@@ -708,30 +708,125 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - } - } - -+static const uint8_t x_rgb[9] = { -+ RY_IDX, GY_IDX, BY_IDX, -+ RU_IDX, GU_IDX, BU_IDX, -+ RV_IDX, GV_IDX, BV_IDX, -+}; -+ -+static const uint8_t x_bgr[9] = { -+ BY_IDX, GY_IDX, RY_IDX, -+ BU_IDX, GU_IDX, RU_IDX, -+ BV_IDX, GV_IDX, RV_IDX, -+}; -+ - void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv) - { -- static const uint8_t x[9] = { -- RY_IDX, GY_IDX, BY_IDX, -- RU_IDX, GU_IDX, BU_IDX, -- RV_IDX, GV_IDX, BV_IDX, -- }; -- rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); -+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); - } - - void 
ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv) - { -- static const uint8_t x[9] = { -- BY_IDX, GY_IDX, RY_IDX, -- BU_IDX, GU_IDX, RU_IDX, -- BV_IDX, GV_IDX, RV_IDX, -- }; -- rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); -+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); - } - -+static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv, -+ const uint8_t x[9]) -+{ -+ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; -+ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; -+ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; -+ int y; -+ const int chromWidth = width >> 1; -+ -+ for (y = 0; y < height; y += 2) { -+ int i; -+ for (i = 0; i < chromWidth; i++) { -+ unsigned int b = src[8 * i + 2]; -+ unsigned int g = src[8 * i + 1]; -+ unsigned int r = src[8 * i + 0]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; -+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; -+ -+ udst[i] = U; -+ vdst[i] = V; -+ ydst[2 * i] = Y; -+ -+ b = src[8 * i + 6]; -+ g = src[8 * i + 5]; -+ r = src[8 * i + 4]; -+ -+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ ydst[2 * i + 1] = Y; -+ } -+ ydst += lumStride; -+ src += srcStride; -+ -+ if (y+1 == height) -+ break; -+ -+ for (i = 0; i < chromWidth; i++) { -+ unsigned int b = src[8 * i + 2]; -+ unsigned int g = src[8 * i + 1]; -+ unsigned int r = src[8 * i + 0]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ -+ ydst[2 * i] = Y; -+ -+ b = src[8 * i + 6]; -+ g = src[8 * i + 5]; -+ 
r = src[8 * i + 4]; -+ -+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ ydst[2 * i + 1] = Y; -+ } -+ udst += chromStride; -+ vdst += chromStride; -+ ydst += lumStride; -+ src += srcStride; -+ } -+} -+ -+static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); -+} -+ -+static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); -+} -+ -+// As the general code does no SIMD-like ops simply adding 1 to the src address -+// will fix the ignored alpha position -+static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); -+} -+ -+static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); -+} -+ -+ - static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, - uint8_t *dest, int width, int height, - int src1Stride, int src2Stride, int dstStride) -@@ -1006,6 +1101,10 @@ static av_cold void rgb2rgb_init_c(void) - planar2x = planar2x_c; - ff_rgb24toyv12 = ff_rgb24toyv12_c; - ff_bgr24toyv12 = ff_bgr24toyv12_c; -+ ff_rgbxtoyv12 = ff_rgbxtoyv12_c; -+ ff_bgrxtoyv12 = ff_bgrxtoyv12_c; -+ ff_xrgbtoyv12 = 
ff_xrgbtoyv12_c; -+ ff_xbgrtoyv12 = ff_xbgrtoyv12_c; - interleaveBytes = interleaveBytes_c; - deinterleaveBytes = deinterleaveBytes_c; - vu9_to_vu12 = vu9_to_vu12_c; -diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c -index 9047030ae4..053c06adf5 100644 ---- a/libswscale/swscale_unscaled.c -+++ b/libswscale/swscale_unscaled.c -@@ -1671,6 +1671,74 @@ static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], - return srcSliceH; - } - -+static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[], -+ int srcStride[], int srcSliceY, int srcSliceH, -+ uint8_t *dst[], int dstStride[]) -+{ -+ ff_bgrxtoyv12( -+ src[0], -+ dst[0] + srcSliceY * dstStride[0], -+ dst[1] + (srcSliceY >> 1) * dstStride[1], -+ dst[2] + (srcSliceY >> 1) * dstStride[2], -+ c->srcW, srcSliceH, -+ dstStride[0], dstStride[1], srcStride[0], -+ c->input_rgb2yuv_table); -+ if (dst[3]) -+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); -+ return srcSliceH; -+} -+ -+static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[], -+ int srcStride[], int srcSliceY, int srcSliceH, -+ uint8_t *dst[], int dstStride[]) -+{ -+ ff_rgbxtoyv12( -+ src[0], -+ dst[0] + srcSliceY * dstStride[0], -+ dst[1] + (srcSliceY >> 1) * dstStride[1], -+ dst[2] + (srcSliceY >> 1) * dstStride[2], -+ c->srcW, srcSliceH, -+ dstStride[0], dstStride[1], srcStride[0], -+ c->input_rgb2yuv_table); -+ if (dst[3]) -+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); -+ return srcSliceH; -+} -+ -+static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[], -+ int srcStride[], int srcSliceY, int srcSliceH, -+ uint8_t *dst[], int dstStride[]) -+{ -+ ff_xbgrtoyv12( -+ src[0], -+ dst[0] + srcSliceY * dstStride[0], -+ dst[1] + (srcSliceY >> 1) * dstStride[1], -+ dst[2] + (srcSliceY >> 1) * dstStride[2], -+ c->srcW, srcSliceH, -+ dstStride[0], dstStride[1], srcStride[0], -+ c->input_rgb2yuv_table); -+ if (dst[3]) -+ fillPlane(dst[3], dstStride[3], c->srcW, 
srcSliceH, srcSliceY, 255); -+ return srcSliceH; -+} -+ -+static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[], -+ int srcStride[], int srcSliceY, int srcSliceH, -+ uint8_t *dst[], int dstStride[]) -+{ -+ ff_xrgbtoyv12( -+ src[0], -+ dst[0] + srcSliceY * dstStride[0], -+ dst[1] + (srcSliceY >> 1) * dstStride[1], -+ dst[2] + (srcSliceY >> 1) * dstStride[2], -+ c->srcW, srcSliceH, -+ dstStride[0], dstStride[1], srcStride[0], -+ c->input_rgb2yuv_table); -+ if (dst[3]) -+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); -+ return srcSliceH; -+} -+ - static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], - int srcStride[], int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -@@ -2060,6 +2128,27 @@ void ff_get_unscaled_swscale(SwsContext *c) - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) - c->convert_unscaled = rgb24ToYv12Wrapper; - -+ /* bgrxtoYV12 */ -+ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) || -+ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -+ !(flags & SWS_ACCURATE_RND)) -+ c->convert_unscaled = bgrxToYv12Wrapper; -+ /* rgbx24toYV12 */ -+ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || -+ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -+ !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ c->convert_unscaled = rgbxToYv12Wrapper; -+ /* xbgrtoYV12 */ -+ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || -+ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -+ !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ c->convert_unscaled = xbgrToYv12Wrapper; -+ /* xrgb24toYV12 */ -+ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || -+ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -+ !(flags & 
SWS_ACCURATE_RND) && !(dstW&1)) -+ c->convert_unscaled = xrgbToYv12Wrapper; -+ - /* RGB/BGR -> RGB/BGR (no dither needed forms) */ - if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) - && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)))) - -From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 20 Apr 2023 11:35:44 +0000 -Subject: [PATCH 125/136] swscale: Add aarch64 unscaled RGB24->YUV420P - -(cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160) ---- - libswscale/aarch64/rgb2rgb.c | 40 +++++++ - libswscale/aarch64/rgb2rgb_neon.S | 181 ++++++++++++++++++++++++++++++ - 2 files changed, 221 insertions(+) - -diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c -index a9bf6ff9e0..6d3e0000dc 100644 ---- a/libswscale/aarch64/rgb2rgb.c -+++ b/libswscale/aarch64/rgb2rgb.c -@@ -30,6 +30,44 @@ - void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, - uint8_t *dest, int width, int height, - int src1Stride, int src2Stride, int dstStride); -+void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv); -+void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv); -+ -+// RGB to YUV asm fns process 16 pixels at once so ensure that the output -+// will fit into the stride. 
ARM64 should cope with unaligned SIMD r/w so -+// don't test for that -+// Fall back to C if we cannot use asm -+ -+static inline int chkw(const int width, const int lumStride, const int chromStride) -+{ -+ const int aw = FFALIGN(width, 16); -+ return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; -+} -+ -+static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *rgb2yuv) -+{ -+ if (chkw(width, lumStride, chromStride)) -+ ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); -+ else -+ ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); -+} -+ -+static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -+ uint8_t *vdst, int width, int height, int lumStride, -+ int chromStride, int srcStride, int32_t *bgr2yuv) -+{ -+ if (chkw(width, lumStride, chromStride)) -+ ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); -+ else -+ ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); -+} -+ - - av_cold void rgb2rgb_init_aarch64(void) - { -@@ -37,5 +75,7 @@ av_cold void rgb2rgb_init_aarch64(void) - - if (have_neon(cpu_flags)) { - interleaveBytes = ff_interleave_bytes_neon; -+ ff_rgb24toyv12 = rgb24toyv12_check; -+ ff_bgr24toyv12 = bgr24toyv12_check; - } - } -diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S -index d81110ec57..8cf40b65f5 100644 ---- a/libswscale/aarch64/rgb2rgb_neon.S -+++ b/libswscale/aarch64/rgb2rgb_neon.S -@@ -77,3 +77,184 @@ function ff_interleave_bytes_neon, export=1 - 0: - ret - endfunc -+ -+// void ff_rgb24toyv12_aarch64( -+// const uint8_t *src, // x0 -+// uint8_t *ydst, // x1 -+// uint8_t *udst, // x2 -+// uint8_t *vdst, // x3 -+// int width, // w4 -+// int height, // w5 
-+// int lumStride, // w6 -+// int chromStride, // w7 -+// int srcStr, // [sp, #0] -+// int32_t *rgb2yuv); // [sp, #8] -+ -+function ff_rgb24toyv12_aarch64, export=1 -+ ldr x15, [sp, #8] -+ ld1 {v3.s}[2], [x15], #4 -+ ld1 {v3.s}[1], [x15], #4 -+ ld1 {v3.s}[0], [x15], #4 -+ ld1 {v4.s}[2], [x15], #4 -+ ld1 {v4.s}[1], [x15], #4 -+ ld1 {v4.s}[0], [x15], #4 -+ ld1 {v5.s}[2], [x15], #4 -+ ld1 {v5.s}[1], [x15], #4 -+ ld1 {v5.s}[0], [x15] -+ b 99f -+endfunc -+ -+// void ff_bgr24toyv12_aarch64( -+// const uint8_t *src, // x0 -+// uint8_t *ydst, // x1 -+// uint8_t *udst, // x2 -+// uint8_t *vdst, // x3 -+// int width, // w4 -+// int height, // w5 -+// int lumStride, // w6 -+// int chromStride, // w7 -+// int srcStr, // [sp, #0] -+// int32_t *rgb2yuv); // [sp, #8] -+ -+function ff_bgr24toyv12_aarch64, export=1 -+ ldr x15, [sp, #8] -+ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 -+ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 -+ ld3 {v3.s, v4.s, v5.s}[2], [x15] -+99: -+ ldr w14, [sp, #0] -+ movi v18.8b, #128 -+ uxtl v17.8h, v18.8b -+ -+ // Even line - YUV -+1: -+ mov x10, x0 -+ mov x11, x1 -+ mov x12, x2 -+ mov x13, x3 -+ mov w9, w4 -+ -+0: -+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ -+ uxtl2 v20.8h, v0.16b -+ uxtl2 v21.8h, v1.16b -+ uxtl2 v22.8h, v2.16b -+ -+ uxtl v0.8h, v0.8b -+ uxtl v1.8h, v1.8b -+ uxtl v2.8h, v2.8b -+ // Y0 -+ smull v6.4s, v0.4h, v3.h[0] -+ smull2 v7.4s, v0.8h, v3.h[0] -+ smlal v6.4s, v1.4h, v4.h[0] -+ smlal2 v7.4s, v1.8h, v4.h[0] -+ smlal v6.4s, v2.4h, v5.h[0] -+ smlal2 v7.4s, v2.8h, v5.h[0] -+ shrn v6.4h, v6.4s, #12 -+ shrn2 v6.8h, v7.4s, #12 -+ add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16) -+ uqrshrn v16.8b, v6.8h, #3 -+ // Y1 -+ smull v6.4s, v20.4h, v3.h[0] -+ smull2 v7.4s, v20.8h, v3.h[0] -+ smlal v6.4s, v21.4h, v4.h[0] -+ smlal2 v7.4s, v21.8h, v4.h[0] -+ smlal v6.4s, v22.4h, v5.h[0] -+ smlal2 v7.4s, v22.8h, v5.h[0] -+ shrn v6.4h, v6.4s, #12 -+ shrn2 v6.8h, v7.4s, #12 -+ add v6.8h, v6.8h, v17.8h -+ uqrshrn2 v16.16b, v6.8h, #3 -+ // Y0/Y1 -+ st1 {v16.16b}, [x11], 
#16 -+ -+ uzp1 v0.8h, v0.8h, v20.8h -+ uzp1 v1.8h, v1.8h, v21.8h -+ uzp1 v2.8h, v2.8h, v22.8h -+ -+ // U -+ // Vector subscript *2 as we loaded into S but are only using H -+ smull v6.4s, v0.4h, v3.h[2] -+ smull2 v7.4s, v0.8h, v3.h[2] -+ smlal v6.4s, v1.4h, v4.h[2] -+ smlal2 v7.4s, v1.8h, v4.h[2] -+ smlal v6.4s, v2.4h, v5.h[2] -+ smlal2 v7.4s, v2.8h, v5.h[2] -+ shrn v6.4h, v6.4s, #14 -+ shrn2 v6.8h, v7.4s, #14 -+ sqrshrn v6.8b, v6.8h, #1 -+ add v6.8b, v6.8b, v18.8b // +128 -+ st1 {v6.8b}, [x12], #8 -+ -+ // V -+ smull v6.4s, v0.4h, v3.h[4] -+ smull2 v7.4s, v0.8h, v3.h[4] -+ smlal v6.4s, v1.4h, v4.h[4] -+ smlal2 v7.4s, v1.8h, v4.h[4] -+ smlal v6.4s, v2.4h, v5.h[4] -+ smlal2 v7.4s, v2.8h, v5.h[4] -+ shrn v6.4h, v6.4s, #14 -+ shrn2 v6.8h, v7.4s, #14 -+ sqrshrn v6.8b, v6.8h, #1 -+ add v6.8b, v6.8b, v18.8b // +128 -+ st1 {v6.8b}, [x13], #8 -+ -+ subs w9, w9, #16 -+ b.gt 0b -+ -+ // Odd line - Y only -+ -+ add x0, x0, w14, SXTX -+ add x1, x1, w6, SXTX -+ mov x10, x0 -+ mov x11, x1 -+ mov w9, w4 -+ -+0: -+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ -+ uxtl2 v20.8h, v0.16b -+ uxtl2 v21.8h, v1.16b -+ uxtl2 v22.8h, v2.16b -+ -+ uxtl v0.8h, v0.8b -+ uxtl v1.8h, v1.8b -+ uxtl v2.8h, v2.8b -+ // Y0 -+ smull v6.4s, v0.4h, v3.h[0] -+ smull2 v7.4s, v0.8h, v3.h[0] -+ smlal v6.4s, v1.4h, v4.h[0] -+ smlal2 v7.4s, v1.8h, v4.h[0] -+ smlal v6.4s, v2.4h, v5.h[0] -+ smlal2 v7.4s, v2.8h, v5.h[0] -+ shrn v6.4h, v6.4s, #12 -+ shrn2 v6.8h, v7.4s, #12 -+ add v6.8h, v6.8h, v17.8h -+ uqrshrn v16.8b, v6.8h, #3 -+ // Y1 -+ smull v6.4s, v20.4h, v3.h[0] -+ smull2 v7.4s, v20.8h, v3.h[0] -+ smlal v6.4s, v21.4h, v4.h[0] -+ smlal2 v7.4s, v21.8h, v4.h[0] -+ smlal v6.4s, v22.4h, v5.h[0] -+ smlal2 v7.4s, v22.8h, v5.h[0] -+ shrn v6.4h, v6.4s, #12 -+ shrn2 v6.8h, v7.4s, #12 -+ add v6.8h, v6.8h, v17.8h -+ uqrshrn2 v16.16b, v6.8h, #3 -+ // Y0/Y1 -+ st1 {v16.16b}, [x11], #16 -+ -+ subs w9, w9, #16 -+ b.gt 0b -+ -+ add x0, x0, w14, SXTX -+ add x1, x1, w6, SXTX -+ add x2, x2, w7, SXTX -+ add x3, x3, w7, SXTX -+ 
subs w5, w5, #2 -+ b.gt 1b -+ -+ ret -+endfunc - -From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 27 Apr 2023 13:03:52 +0000 -Subject: [PATCH 126/136] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh - -(cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d) ---- - libswscale/aarch64/rgb2rgb.c | 5 +- - libswscale/aarch64/rgb2rgb_neon.S | 440 ++++++++++++++++++++++++------ - 2 files changed, 355 insertions(+), 90 deletions(-) - -diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c -index 6d3e0000dc..f10c4ef2de 100644 ---- a/libswscale/aarch64/rgb2rgb.c -+++ b/libswscale/aarch64/rgb2rgb.c -@@ -44,8 +44,9 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - - static inline int chkw(const int width, const int lumStride, const int chromStride) - { -- const int aw = FFALIGN(width, 16); -- return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; -+// const int aw = FFALIGN(width, 16); -+// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; -+ return 1; - } - - static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S -index 8cf40b65f5..978ab443ea 100644 ---- a/libswscale/aarch64/rgb2rgb_neon.S -+++ b/libswscale/aarch64/rgb2rgb_neon.S -@@ -116,6 +116,25 @@ endfunc - // int srcStr, // [sp, #0] - // int32_t *rgb2yuv); // [sp, #8] - -+// regs -+// v0-2 Src bytes - reused as chroma src -+// v3-5 Coeffs (packed very inefficiently - could be squashed) -+// v6 128b -+// v7 128h -+// v8-15 Reserved -+// v16-18 Lo Src expanded as H -+// v19 - -+// v20-22 Hi Src expanded as H -+// v23 - -+// v24 U out -+// v25 U tmp -+// v26 Y out -+// v27-29 Y tmp -+// v30 V out -+// v31 V tmp -+ -+// Assumes Little Endian in tail stores & conversion matrix -+ - function ff_bgr24toyv12_aarch64, export=1 - ldr x15, [sp, #8] - ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 -@@ 
-123,138 +142,383 @@ function ff_bgr24toyv12_aarch64, export=1 - ld3 {v3.s, v4.s, v5.s}[2], [x15] - 99: - ldr w14, [sp, #0] -- movi v18.8b, #128 -- uxtl v17.8h, v18.8b -- -- // Even line - YUV -+ movi v7.8b, #128 -+ uxtl v6.8h, v7.8b -+ // Ensure if nothing to do then we do nothing -+ cmp w4, #0 -+ b.le 90f -+ cmp w5, #0 -+ b.le 90f -+ // If w % 16 != 0 then -16 so we do main loop 1 fewer times with -+ // the remainder done in the tail -+ tst w4, #15 -+ b.eq 1f -+ sub w4, w4, #16 - 1: -+ -+// -------------------- Even line body - YUV -+11: -+ subs w9, w4, #0 - mov x10, x0 - mov x11, x1 - mov x12, x2 - mov x13, x3 -- mov w9, w4 -+ b.lt 12f - --0: - ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ subs w9, w9, #16 -+ b.le 13f -+ -+10: -+ uxtl v16.8h, v0.8b -+ uxtl v17.8h, v1.8b -+ uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - -- uxtl v0.8h, v0.8b -- uxtl v1.8h, v1.8b -- uxtl v2.8h, v2.8b -+ bic v0.8h, #0xff, LSL #8 -+ bic v1.8h, #0xff, LSL #8 -+ bic v2.8h, #0xff, LSL #8 -+ -+ // Testing shows it is faster to stack the smull/smlal ops together -+ // rather than interleave them between channels and indeed even the -+ // shift/add sections seem happier not interleaved -+ - // Y0 -- smull v6.4s, v0.4h, v3.h[0] -- smull2 v7.4s, v0.8h, v3.h[0] -- smlal v6.4s, v1.4h, v4.h[0] -- smlal2 v7.4s, v1.8h, v4.h[0] -- smlal v6.4s, v2.4h, v5.h[0] -- smlal2 v7.4s, v2.8h, v5.h[0] -- shrn v6.4h, v6.4s, #12 -- shrn2 v6.8h, v7.4s, #12 -- add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16) -- uqrshrn v16.8b, v6.8h, #3 -+ smull v26.4s, v16.4h, v3.h[0] -+ smlal v26.4s, v17.4h, v4.h[0] -+ smlal v26.4s, v18.4h, v5.h[0] -+ smull2 v27.4s, v16.8h, v3.h[0] -+ smlal2 v27.4s, v17.8h, v4.h[0] -+ smlal2 v27.4s, v18.8h, v5.h[0] - // Y1 -- smull v6.4s, v20.4h, v3.h[0] -- smull2 v7.4s, v20.8h, v3.h[0] -- smlal v6.4s, v21.4h, v4.h[0] -- smlal2 v7.4s, v21.8h, v4.h[0] -- smlal v6.4s, v22.4h, v5.h[0] -- smlal2 v7.4s, v22.8h, v5.h[0] -- shrn v6.4h, v6.4s, #12 -- shrn2 v6.8h, 
v7.4s, #12 -- add v6.8h, v6.8h, v17.8h -- uqrshrn2 v16.16b, v6.8h, #3 -+ smull v28.4s, v20.4h, v3.h[0] -+ smlal v28.4s, v21.4h, v4.h[0] -+ smlal v28.4s, v22.4h, v5.h[0] -+ smull2 v29.4s, v20.8h, v3.h[0] -+ smlal2 v29.4s, v21.8h, v4.h[0] -+ smlal2 v29.4s, v22.8h, v5.h[0] -+ shrn v26.4h, v26.4s, #12 -+ shrn2 v26.8h, v27.4s, #12 -+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -+ uqrshrn v26.8b, v26.8h, #3 -+ shrn v28.4h, v28.4s, #12 -+ shrn2 v28.8h, v29.4s, #12 -+ add v28.8h, v28.8h, v6.8h -+ uqrshrn2 v26.16b, v28.8h, #3 - // Y0/Y1 -- st1 {v16.16b}, [x11], #16 -- -- uzp1 v0.8h, v0.8h, v20.8h -- uzp1 v1.8h, v1.8h, v21.8h -- uzp1 v2.8h, v2.8h, v22.8h - - // U - // Vector subscript *2 as we loaded into S but are only using H -- smull v6.4s, v0.4h, v3.h[2] -- smull2 v7.4s, v0.8h, v3.h[2] -- smlal v6.4s, v1.4h, v4.h[2] -- smlal2 v7.4s, v1.8h, v4.h[2] -- smlal v6.4s, v2.4h, v5.h[2] -- smlal2 v7.4s, v2.8h, v5.h[2] -- shrn v6.4h, v6.4s, #14 -- shrn2 v6.8h, v7.4s, #14 -- sqrshrn v6.8b, v6.8h, #1 -- add v6.8b, v6.8b, v18.8b // +128 -- st1 {v6.8b}, [x12], #8 -+ smull v24.4s, v0.4h, v3.h[2] -+ smlal v24.4s, v1.4h, v4.h[2] -+ smlal v24.4s, v2.4h, v5.h[2] -+ smull2 v25.4s, v0.8h, v3.h[2] -+ smlal2 v25.4s, v1.8h, v4.h[2] -+ smlal2 v25.4s, v2.8h, v5.h[2] - - // V -- smull v6.4s, v0.4h, v3.h[4] -- smull2 v7.4s, v0.8h, v3.h[4] -- smlal v6.4s, v1.4h, v4.h[4] -- smlal2 v7.4s, v1.8h, v4.h[4] -- smlal v6.4s, v2.4h, v5.h[4] -- smlal2 v7.4s, v2.8h, v5.h[4] -- shrn v6.4h, v6.4s, #14 -- shrn2 v6.8h, v7.4s, #14 -- sqrshrn v6.8b, v6.8h, #1 -- add v6.8b, v6.8b, v18.8b // +128 -- st1 {v6.8b}, [x13], #8 -+ smull v30.4s, v0.4h, v3.h[4] -+ smlal v30.4s, v1.4h, v4.h[4] -+ smlal v30.4s, v2.4h, v5.h[4] -+ smull2 v31.4s, v0.8h, v3.h[4] -+ smlal2 v31.4s, v1.8h, v4.h[4] -+ smlal2 v31.4s, v2.8h, v5.h[4] -+ -+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ -+ shrn v24.4h, v24.4s, #14 -+ shrn2 v24.8h, v25.4s, #14 -+ sqrshrn v24.8b, v24.8h, #1 -+ add v24.8b, v24.8b, v7.8b // +128 -+ shrn v30.4h, v30.4s, #14 -+ 
shrn2 v30.8h, v31.4s, #14 -+ sqrshrn v30.8b, v30.8h, #1 -+ add v30.8b, v30.8b, v7.8b // +128 - - subs w9, w9, #16 -- b.gt 0b - -- // Odd line - Y only -+ st1 {v26.16b}, [x11], #16 -+ st1 {v24.8b}, [x12], #8 -+ st1 {v30.8b}, [x13], #8 -+ -+ b.gt 10b -+ -+// -------------------- Even line tail - YUV -+// If width % 16 == 0 then simply runs once with preloaded RGB -+// If other then deals with preload & then does remaining tail -+ -+13: -+ // Body is simple copy of main loop body minus preload -+ -+ uxtl v16.8h, v0.8b -+ uxtl v17.8h, v1.8b -+ uxtl v18.8h, v2.8b -+ -+ uxtl2 v20.8h, v0.16b -+ uxtl2 v21.8h, v1.16b -+ uxtl2 v22.8h, v2.16b -+ -+ bic v0.8h, #0xff, LSL #8 -+ bic v1.8h, #0xff, LSL #8 -+ bic v2.8h, #0xff, LSL #8 -+ -+ // Y0 -+ smull v26.4s, v16.4h, v3.h[0] -+ smlal v26.4s, v17.4h, v4.h[0] -+ smlal v26.4s, v18.4h, v5.h[0] -+ smull2 v27.4s, v16.8h, v3.h[0] -+ smlal2 v27.4s, v17.8h, v4.h[0] -+ smlal2 v27.4s, v18.8h, v5.h[0] -+ // Y1 -+ smull v28.4s, v20.4h, v3.h[0] -+ smlal v28.4s, v21.4h, v4.h[0] -+ smlal v28.4s, v22.4h, v5.h[0] -+ smull2 v29.4s, v20.8h, v3.h[0] -+ smlal2 v29.4s, v21.8h, v4.h[0] -+ smlal2 v29.4s, v22.8h, v5.h[0] -+ shrn v26.4h, v26.4s, #12 -+ shrn2 v26.8h, v27.4s, #12 -+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -+ uqrshrn v26.8b, v26.8h, #3 -+ shrn v28.4h, v28.4s, #12 -+ shrn2 v28.8h, v29.4s, #12 -+ add v28.8h, v28.8h, v6.8h -+ uqrshrn2 v26.16b, v28.8h, #3 -+ // Y0/Y1 -+ -+ // U -+ // Vector subscript *2 as we loaded into S but are only using H -+ smull v24.4s, v0.4h, v3.h[2] -+ smlal v24.4s, v1.4h, v4.h[2] -+ smlal v24.4s, v2.4h, v5.h[2] -+ smull2 v25.4s, v0.8h, v3.h[2] -+ smlal2 v25.4s, v1.8h, v4.h[2] -+ smlal2 v25.4s, v2.8h, v5.h[2] - -+ // V -+ smull v30.4s, v0.4h, v3.h[4] -+ smlal v30.4s, v1.4h, v4.h[4] -+ smlal v30.4s, v2.4h, v5.h[4] -+ smull2 v31.4s, v0.8h, v3.h[4] -+ smlal2 v31.4s, v1.8h, v4.h[4] -+ smlal2 v31.4s, v2.8h, v5.h[4] -+ -+ cmp w9, #-16 -+ -+ shrn v24.4h, v24.4s, #14 -+ shrn2 v24.8h, v25.4s, #14 -+ sqrshrn v24.8b, v24.8h, 
#1 -+ add v24.8b, v24.8b, v7.8b // +128 -+ shrn v30.4h, v30.4s, #14 -+ shrn2 v30.8h, v31.4s, #14 -+ sqrshrn v30.8b, v30.8h, #1 -+ add v30.8b, v30.8b, v7.8b // +128 -+ -+ // Here: -+ // w9 == 0 width % 16 == 0, tail done -+ // w9 > -16 1st tail done (16 pels), remainder still to go -+ // w9 == -16 shouldn't happen -+ // w9 > -32 2nd tail done -+ // w9 <= -32 shouldn't happen -+ -+ b.lt 2f -+ st1 {v26.16b}, [x11], #16 -+ st1 {v24.8b}, [x12], #8 -+ st1 {v30.8b}, [x13], #8 -+ cbz w9, 3f -+ -+12: -+ sub w9, w9, #16 -+ -+ tbz w9, #3, 1f -+ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 -+1: tbz w9, #2, 1f -+ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 -+1: tbz w9, #1, 1f -+ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 -+1: tbz w9, #0, 13b -+ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 -+ b 13b -+ -+2: -+ tbz w9, #3, 1f -+ st1 {v26.8b}, [x11], #8 -+ st1 {v24.s}[0], [x12], #4 -+ st1 {v30.s}[0], [x13], #4 -+1: tbz w9, #2, 1f -+ st1 {v26.s}[2], [x11], #4 -+ st1 {v24.h}[2], [x12], #2 -+ st1 {v30.h}[2], [x13], #2 -+1: tbz w9, #1, 1f -+ st1 {v26.h}[6], [x11], #2 -+ st1 {v24.b}[6], [x12], #1 -+ st1 {v30.b}[6], [x13], #1 -+1: tbz w9, #0, 1f -+ st1 {v26.b}[14], [x11] -+ st1 {v24.b}[7], [x12] -+ st1 {v30.b}[7], [x13] -+1: -+3: -+ -+// -------------------- Odd line body - Y only -+ -+ subs w5, w5, #1 -+ b.eq 90f -+ -+ subs w9, w4, #0 - add x0, x0, w14, SXTX - add x1, x1, w6, SXTX - mov x10, x0 - mov x11, x1 -- mov w9, w4 -+ b.lt 12f - --0: - ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ subs w9, w9, #16 -+ b.le 13f -+ -+10: -+ uxtl v16.8h, v0.8b -+ uxtl v17.8h, v1.8b -+ uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - -- uxtl v0.8h, v0.8b -- uxtl v1.8h, v1.8b -- uxtl v2.8h, v2.8b -+ // Testing shows it is faster to stack the smull/smlal ops together -+ // rather than interleave them between channels and indeed 
even the -+ // shift/add sections seem happier not interleaved -+ - // Y0 -- smull v6.4s, v0.4h, v3.h[0] -- smull2 v7.4s, v0.8h, v3.h[0] -- smlal v6.4s, v1.4h, v4.h[0] -- smlal2 v7.4s, v1.8h, v4.h[0] -- smlal v6.4s, v2.4h, v5.h[0] -- smlal2 v7.4s, v2.8h, v5.h[0] -- shrn v6.4h, v6.4s, #12 -- shrn2 v6.8h, v7.4s, #12 -- add v6.8h, v6.8h, v17.8h -- uqrshrn v16.8b, v6.8h, #3 -+ smull v26.4s, v16.4h, v3.h[0] -+ smlal v26.4s, v17.4h, v4.h[0] -+ smlal v26.4s, v18.4h, v5.h[0] -+ smull2 v27.4s, v16.8h, v3.h[0] -+ smlal2 v27.4s, v17.8h, v4.h[0] -+ smlal2 v27.4s, v18.8h, v5.h[0] - // Y1 -- smull v6.4s, v20.4h, v3.h[0] -- smull2 v7.4s, v20.8h, v3.h[0] -- smlal v6.4s, v21.4h, v4.h[0] -- smlal2 v7.4s, v21.8h, v4.h[0] -- smlal v6.4s, v22.4h, v5.h[0] -- smlal2 v7.4s, v22.8h, v5.h[0] -- shrn v6.4h, v6.4s, #12 -- shrn2 v6.8h, v7.4s, #12 -- add v6.8h, v6.8h, v17.8h -- uqrshrn2 v16.16b, v6.8h, #3 -+ smull v28.4s, v20.4h, v3.h[0] -+ smlal v28.4s, v21.4h, v4.h[0] -+ smlal v28.4s, v22.4h, v5.h[0] -+ smull2 v29.4s, v20.8h, v3.h[0] -+ smlal2 v29.4s, v21.8h, v4.h[0] -+ smlal2 v29.4s, v22.8h, v5.h[0] -+ -+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 -+ -+ shrn v26.4h, v26.4s, #12 -+ shrn2 v26.8h, v27.4s, #12 -+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -+ uqrshrn v26.8b, v26.8h, #3 -+ shrn v28.4h, v28.4s, #12 -+ shrn2 v28.8h, v29.4s, #12 -+ add v28.8h, v28.8h, v6.8h -+ uqrshrn2 v26.16b, v28.8h, #3 - // Y0/Y1 -- st1 {v16.16b}, [x11], #16 - - subs w9, w9, #16 -- b.gt 0b -+ -+ st1 {v26.16b}, [x11], #16 -+ -+ b.gt 10b -+ -+// -------------------- Odd line tail - Y -+// If width % 16 == 0 then simply runs once with preloaded RGB -+// If other then deals with preload & then does remaining tail -+ -+13: -+ // Body is simple copy of main loop body minus preload -+ -+ uxtl v16.8h, v0.8b -+ uxtl v17.8h, v1.8b -+ uxtl v18.8h, v2.8b -+ -+ uxtl2 v20.8h, v0.16b -+ uxtl2 v21.8h, v1.16b -+ uxtl2 v22.8h, v2.16b -+ -+ // Y0 -+ smull v26.4s, v16.4h, v3.h[0] -+ smlal v26.4s, v17.4h, v4.h[0] -+ smlal v26.4s, 
v18.4h, v5.h[0] -+ smull2 v27.4s, v16.8h, v3.h[0] -+ smlal2 v27.4s, v17.8h, v4.h[0] -+ smlal2 v27.4s, v18.8h, v5.h[0] -+ // Y1 -+ smull v28.4s, v20.4h, v3.h[0] -+ smlal v28.4s, v21.4h, v4.h[0] -+ smlal v28.4s, v22.4h, v5.h[0] -+ smull2 v29.4s, v20.8h, v3.h[0] -+ smlal2 v29.4s, v21.8h, v4.h[0] -+ smlal2 v29.4s, v22.8h, v5.h[0] -+ -+ cmp w9, #-16 -+ -+ shrn v26.4h, v26.4s, #12 -+ shrn2 v26.8h, v27.4s, #12 -+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -+ uqrshrn v26.8b, v26.8h, #3 -+ shrn v28.4h, v28.4s, #12 -+ shrn2 v28.8h, v29.4s, #12 -+ add v28.8h, v28.8h, v6.8h -+ uqrshrn2 v26.16b, v28.8h, #3 -+ // Y0/Y1 -+ -+ // Here: -+ // w9 == 0 width % 16 == 0, tail done -+ // w9 > -16 1st tail done (16 pels), remainder still to go -+ // w9 == -16 shouldn't happen -+ // w9 > -32 2nd tail done -+ // w9 <= -32 shouldn't happen -+ -+ b.lt 2f -+ st1 {v26.16b}, [x11], #16 -+ cbz w9, 3f -+ -+12: -+ sub w9, w9, #16 -+ -+ tbz w9, #3, 1f -+ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 -+1: tbz w9, #2, 1f -+ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 -+1: tbz w9, #1, 1f -+ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 -+ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 -+1: tbz w9, #0, 13b -+ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 -+ b 13b -+ -+2: -+ tbz w9, #3, 1f -+ st1 {v26.8b}, [x11], #8 -+1: tbz w9, #2, 1f -+ st1 {v26.s}[2], [x11], #4 -+1: tbz w9, #1, 1f -+ st1 {v26.h}[6], [x11], #2 -+1: tbz w9, #0, 1f -+ st1 {v26.b}[14], [x11] -+1: -+3: -+ -+// ------------------- Loop to start - - add x0, x0, w14, SXTX - add x1, x1, w6, SXTX - add x2, x2, w7, SXTX - add x3, x3, w7, SXTX -- subs w5, w5, #2 -- b.gt 1b -- -+ subs w5, w5, #1 -+ b.gt 11b -+90: - ret - endfunc - -From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 26 Apr 2023 15:36:07 +0000 -Subject: [PATCH 127/136] rgb2rgb: Use asm unconditionally - -(cherry picked from commit 
7c216c0804836b31c0ea093bb1dde5ab387724b1) ---- - libswscale/aarch64/rgb2rgb.c | 37 ++---------------------------------- - 1 file changed, 2 insertions(+), 35 deletions(-) - -diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c -index f10c4ef2de..6a0e2dcc09 100644 ---- a/libswscale/aarch64/rgb2rgb.c -+++ b/libswscale/aarch64/rgb2rgb.c -@@ -37,46 +37,13 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv); - --// RGB to YUV asm fns process 16 pixels at once so ensure that the output --// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so --// don't test for that --// Fall back to C if we cannot use asm -- --static inline int chkw(const int width, const int lumStride, const int chromStride) --{ --// const int aw = FFALIGN(width, 16); --// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; -- return 1; --} -- --static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -- uint8_t *vdst, int width, int height, int lumStride, -- int chromStride, int srcStride, int32_t *rgb2yuv) --{ -- if (chkw(width, lumStride, chromStride)) -- ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); -- else -- ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); --} -- --static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, -- uint8_t *vdst, int width, int height, int lumStride, -- int chromStride, int srcStride, int32_t *bgr2yuv) --{ -- if (chkw(width, lumStride, chromStride)) -- ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); -- else -- ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); --} -- -- - av_cold void rgb2rgb_init_aarch64(void) - { - int 
cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - interleaveBytes = ff_interleave_bytes_neon; -- ff_rgb24toyv12 = rgb24toyv12_check; -- ff_bgr24toyv12 = bgr24toyv12_check; -+ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64; -+ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64; - } - } - -From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 27 Apr 2023 13:01:43 +0000 -Subject: [PATCH 128/136] tests/swscale: Add options for width and height on - the command line - -(cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271) ---- - libswscale/tests/swscale.c | 84 ++++++++++++++++++++++++++------------ - 1 file changed, 59 insertions(+), 25 deletions(-) - -diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c -index 6c38041ddb..4cf41d9f64 100644 ---- a/libswscale/tests/swscale.c -+++ b/libswscale/tests/swscale.c -@@ -355,56 +355,71 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4], - return 0; - } - --#define W 96 --#define H 96 -- - int main(int argc, char **argv) - { -+ unsigned int W = 96; -+ unsigned int H = 96; -+ unsigned int W2; -+ unsigned int H2; -+ unsigned int S; - enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; - enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE; -- uint8_t *rgb_data = av_malloc(W * H * 4); -- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; -- int rgb_stride[4] = { 4 * W, 0, 0, 0 }; -- uint8_t *data = av_malloc(4 * W * H); -- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; -- int stride[4] = { W, W, W, W }; - int x, y; - struct SwsContext *sws; - AVLFG rand; - int res = -1; - int i; - FILE *fp = NULL; -- -- if (!rgb_data || !data) -- return -1; -+ uint8_t *rgb_data; -+ uint8_t * rgb_src[4] = { NULL }; -+ int rgb_stride[4] = { 0 }; -+ uint8_t *data; -+ uint8_t * src[4] = { NULL }; -+ int stride[4] = { 0 }; - - for (i = 1; i < argc; i += 2) { -+ const char * const arg2 = argv[i+1]; -+ - if 
(argv[i][0] != '-' || i + 1 == argc) - goto bad_option; - if (!strcmp(argv[i], "-ref")) { -- fp = fopen(argv[i + 1], "r"); -+ fp = fopen(arg2, "r"); - if (!fp) { -- fprintf(stderr, "could not open '%s'\n", argv[i + 1]); -+ fprintf(stderr, "could not open '%s'\n", arg2); - goto error; - } - } else if (!strcmp(argv[i], "-cpuflags")) { - unsigned flags = av_get_cpu_flags(); -- int ret = av_parse_cpu_caps(&flags, argv[i + 1]); -+ int ret = av_parse_cpu_caps(&flags, arg2); - if (ret < 0) { -- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]); -+ fprintf(stderr, "invalid cpu flags %s\n", arg2); - return ret; - } - av_force_cpu_flags(flags); - } else if (!strcmp(argv[i], "-src")) { -- srcFormat = av_get_pix_fmt(argv[i + 1]); -+ srcFormat = av_get_pix_fmt(arg2); - if (srcFormat == AV_PIX_FMT_NONE) { -- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); -+ fprintf(stderr, "invalid pixel format %s\n", arg2); - return -1; - } - } else if (!strcmp(argv[i], "-dst")) { -- dstFormat = av_get_pix_fmt(argv[i + 1]); -+ dstFormat = av_get_pix_fmt(arg2); - if (dstFormat == AV_PIX_FMT_NONE) { -- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); -+ fprintf(stderr, "invalid pixel format %s\n", arg2); -+ return -1; -+ } -+ } else if (!strcmp(argv[i], "-w")) { -+ char * p = NULL; -+ W = strtoul(arg2, &p, 0); -+ if (!W || *p) { -+ fprintf(stderr, "bad width %s\n", arg2); -+ return -1; -+ } -+ } else if (!strcmp(argv[i], "-h")) { -+ char * p = NULL; -+ H = strtoul(arg2, &p, 0); -+ if (!H || *p) { -+ fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p); - return -1; - } - } else { -@@ -414,15 +429,34 @@ bad_option: - } - } - -- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, -+ S = (W + 15) & ~15; -+ rgb_data = av_mallocz(S * H * 4); -+ rgb_src[0] = rgb_data; -+ rgb_stride[0] = 4 * S; -+ data = av_mallocz(4 * S * H); -+ src[0] = data; -+ src[1] = data + S * H; -+ src[2] = data + S * H * 2; -+ src[3] = data + S * H * 3; -+ stride[0] = S; -+ 
stride[1] = S; -+ stride[2] = S; -+ stride[3] = S; -+ H2 = H < 96 ? 8 : H / 12; -+ W2 = W < 96 ? 8 : W / 12; -+ -+ if (!rgb_data || !data) -+ return -1; -+ -+ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H, - AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); - - av_lfg_init(&rand, 1); - - for (y = 0; y < H; y++) - for (x = 0; x < W * 4; x++) -- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); -- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride); -+ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand); -+ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride); - if (res < 0 || res != H) { - res = -1; - goto error; -@@ -431,10 +465,10 @@ bad_option: - av_free(rgb_data); - - if(fp) { -- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); -+ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat); - fclose(fp); - } else { -- selfTest(src, stride, W, H, srcFormat, dstFormat); -+ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat); - res = 0; - } - error: - -From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 26 Apr 2023 16:31:23 +0000 -Subject: [PATCH 129/136] tests/swscale: Add a timing option - --t Where n is the number of time to loop the scale op. 
- Often useful to do it 10 times or so for better resolution - -(cherry picked from commit 50cd60a23a66254f911376602d07b30fcafbde96) ---- - libswscale/tests/swscale.c | 32 ++++++++++++++++++++++++++++++-- - 1 file changed, 30 insertions(+), 2 deletions(-) - -diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c -index 4cf41d9f64..12776ffec7 100644 ---- a/libswscale/tests/swscale.c -+++ b/libswscale/tests/swscale.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - - #undef HAVE_AV_CONFIG_H - #include "libavutil/cpu.h" -@@ -78,6 +79,15 @@ struct Results { - uint32_t crc; - }; - -+static int time_rep = 0; -+ -+static uint64_t utime(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000; -+} -+ - // test by ref -> src -> dst -> out & compare out against ref - // ref & out are YV12 - static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, -@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, - goto end; - } - -- printf(" %s %dx%d -> %s %3dx%3d flags=%2d", -+ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d", - desc_src->name, srcW, srcH, - desc_dst->name, dstW, dstH, - flags); -@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, - - sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); - -+ if (time_rep != 0) -+ { -+ const uint64_t now = utime(); -+ uint64_t done; -+ for (i = 1; i != time_rep; ++i) { -+ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); -+ } -+ done = utime(); -+ printf(" T=%7"PRId64"us ", done-now); -+ } -+ - for (i = 0; i < 4 && dstStride[i]; i++) - crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], - dstStride[i] * dstH); -@@ -419,7 +440,14 @@ int main(int argc, char **argv) - char * p = NULL; - H = strtoul(arg2, &p, 0); - if (!H || *p) { -- 
fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p); -+ fprintf(stderr, "bad height '%s'\n", arg2); -+ return -1; -+ } -+ } else if (!strcmp(argv[i], "-t")) { -+ char * p = NULL; -+ time_rep = (int)strtol(arg2, &p, 0); -+ if (*p) { -+ fprintf(stderr, "bad time repetitions '%s'\n", arg2); - return -1; - } - } else { - -From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 20 Apr 2023 13:40:36 +0000 -Subject: [PATCH 130/136] swscale: RGB->YUV420 fix C template to allow odd - widths - -(cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8) ---- - libswscale/rgb2rgb_template.c | 44 +++++++++++++++++++++++++++++++++++ - libswscale/swscale_unscaled.c | 11 ++++----- - 2 files changed, 49 insertions(+), 6 deletions(-) - -diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c -index 703de90690..e711589e1e 100644 ---- a/libswscale/rgb2rgb_template.c -+++ b/libswscale/rgb2rgb_template.c -@@ -679,6 +679,19 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; - ydst[2 * i + 1] = Y; - } -+ if ((width & 1) != 0) { -+ unsigned int b = src[6 * i + 0]; -+ unsigned int g = src[6 * i + 1]; -+ unsigned int r = src[6 * i + 2]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; -+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; -+ -+ udst[i] = U; -+ vdst[i] = V; -+ ydst[2 * i] = Y; -+ } - ydst += lumStride; - src += srcStride; - -@@ -701,6 +714,15 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; - ydst[2 * i + 1] = Y; - } -+ if ((width & 1) != 0) { -+ unsigned int b = src[6 * i + 0]; -+ unsigned int g = src[6 * i + 1]; -+ unsigned int r = src[6 * i + 2]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> 
RGB2YUV_SHIFT) + 16; -+ -+ ydst[2 * i] = Y; -+ } - udst += chromStride; - vdst += chromStride; - ydst += lumStride; -@@ -767,6 +789,19 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; - ydst[2 * i + 1] = Y; - } -+ if ((width & 1) != 0) { -+ unsigned int b = src[8 * i + 2]; -+ unsigned int g = src[8 * i + 1]; -+ unsigned int r = src[8 * i + 0]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; -+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; -+ -+ udst[i] = U; -+ vdst[i] = V; -+ ydst[2 * i] = Y; -+ } - ydst += lumStride; - src += srcStride; - -@@ -789,6 +824,15 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; - ydst[2 * i + 1] = Y; - } -+ if ((width & 1) != 0) { -+ unsigned int b = src[8 * i + 2]; -+ unsigned int g = src[8 * i + 1]; -+ unsigned int r = src[8 * i + 0]; -+ -+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; -+ -+ ydst[2 * i] = Y; -+ } - udst += chromStride; - vdst += chromStride; - ydst += lumStride; -diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c -index 053c06adf5..52469b2e4a 100644 ---- a/libswscale/swscale_unscaled.c -+++ b/libswscale/swscale_unscaled.c -@@ -2062,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c) - const enum AVPixelFormat dstFormat = c->dstFormat; - const int flags = c->flags; - const int dstH = c->dstH; -- const int dstW = c->dstW; - int needsDither; - - needsDither = isAnyRGB(dstFormat) && -@@ -2120,12 +2119,12 @@ void ff_get_unscaled_swscale(SwsContext *c) - /* bgr24toYV12 */ - if (srcFormat == AV_PIX_FMT_BGR24 && - (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && -- !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ !(flags & SWS_ACCURATE_RND)) - c->convert_unscaled 
= bgr24ToYv12Wrapper; - /* rgb24toYV12 */ - if (srcFormat == AV_PIX_FMT_RGB24 && - (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && -- !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ !(flags & SWS_ACCURATE_RND)) - c->convert_unscaled = rgb24ToYv12Wrapper; - - /* bgrxtoYV12 */ -@@ -2136,17 +2135,17 @@ void ff_get_unscaled_swscale(SwsContext *c) - /* rgbx24toYV12 */ - if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || - (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -- !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ !(flags & SWS_ACCURATE_RND)) - c->convert_unscaled = rgbxToYv12Wrapper; - /* xbgrtoYV12 */ - if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || - (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -- !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ !(flags & SWS_ACCURATE_RND)) - c->convert_unscaled = xbgrToYv12Wrapper; - /* xrgb24toYV12 */ - if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || - (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && -- !(flags & SWS_ACCURATE_RND) && !(dstW&1)) -+ !(flags & SWS_ACCURATE_RND)) - c->convert_unscaled = xrgbToYv12Wrapper; - - /* RGB/BGR -> RGB/BGR (no dither needed forms) */ - -From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Thu, 4 May 2023 14:26:14 +0000 -Subject: [PATCH 131/136] rtpenc: Add code to send H264 new extradata in - sidedata - -Fixes issue with pi V4L2 H264 encode which cannot create extradata -at init time. 
- -(cherry picked from commit 4f852b4b093f841b64b4934a6f1720e98e4e0f2c) ---- - libavformat/rtpenc.c | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c -index a8d296a154..f67dc2a15a 100644 ---- a/libavformat/rtpenc.c -+++ b/libavformat/rtpenc.c -@@ -19,6 +19,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "avc.h" - #include "avformat.h" - #include "mpegts.h" - #include "internal.h" -@@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt) - ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0); - break; - case AV_CODEC_ID_H264: -+ { -+ uint8_t *side_data; -+ int side_data_size = 0; -+ -+ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, -+ &side_data_size); -+ -+ if (side_data_size != 0) { -+ int ps_size = side_data_size; -+ uint8_t * ps_buf = NULL; -+ -+ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size); -+ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size); -+ ff_rtp_send_h264_hevc(s1, ps_buf ? 
ps_buf : side_data, ps_size); -+ av_free(ps_buf); -+ } - ff_rtp_send_h264_hevc(s1, pkt->data, size); - break; -+ } - case AV_CODEC_ID_H261: - ff_rtp_send_h261(s1, pkt->data, size); - break; - -From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Mon, 5 Jun 2023 08:34:38 +0000 -Subject: [PATCH 132/136] rgb2rgb: Fix luma narrow+saturation instruction - -(cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a) ---- - libswscale/aarch64/rgb2rgb_neon.S | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S -index 978ab443ea..476ca723a0 100644 ---- a/libswscale/aarch64/rgb2rgb_neon.S -+++ b/libswscale/aarch64/rgb2rgb_neon.S -@@ -203,11 +203,11 @@ function ff_bgr24toyv12_aarch64, export=1 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -- uqrshrn v26.8b, v26.8h, #3 -+ sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h -- uqrshrn2 v26.16b, v28.8h, #3 -+ sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 - - // U -@@ -282,11 +282,11 @@ function ff_bgr24toyv12_aarch64, export=1 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -- uqrshrn v26.8b, v26.8h, #3 -+ sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h -- uqrshrn2 v26.16b, v28.8h, #3 -+ sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 - - // U -@@ -416,11 +416,11 @@ function ff_bgr24toyv12_aarch64, export=1 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -- uqrshrn v26.8b, v26.8h, #3 -+ sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h -- uqrshrn2 v26.16b, v28.8h, #3 -+ sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 - - subs w9, w9, #16 -@@ -464,11 
+464,11 @@ function ff_bgr24toyv12_aarch64, export=1 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) -- uqrshrn v26.8b, v26.8h, #3 -+ sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h -- uqrshrn2 v26.16b, v28.8h, #3 -+ sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 - - // Here: - -From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Sun, 4 Jun 2023 13:37:59 +0000 -Subject: [PATCH 133/136] v4l2_m2m_dec: Tweak pending count to use dts & - reorder size - -(cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a) ---- - libavcodec/v4l2_m2m.h | 1 + - libavcodec/v4l2_m2m_dec.c | 53 +++++++++++++++++++++++++++++++-------- - 2 files changed, 43 insertions(+), 11 deletions(-) - -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index ded1478a49..a506e69d67 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -115,6 +115,7 @@ typedef struct V4L2m2mContext { - - /* req pkt */ - int req_pkt; -+ int reorder_size; - - /* Ext data sent */ - int extdata_sent; -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index d124c7b1fc..13af62e819 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -121,13 +121,18 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len) - } - #endif - --static int64_t pts_stats_guess(const pts_stats_t * const stats) -+static unsigned int pts_stats_interval(const pts_stats_t * const stats) -+{ -+ return stats->last_interval; -+} -+ -+static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess) - { - if (stats->last_count <= 1) - return stats->last_pts; - if (stats->last_pts == AV_NOPTS_VALUE || -- stats->last_interval == 0 || -- stats->last_count >= STATS_LAST_COUNT_MAX) -+ fail_bad_guess && (stats->last_interval == 0 || -+ stats->last_count >= STATS_LAST_COUNT_MAX)) - return 
AV_NOPTS_VALUE; - return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; - } -@@ -345,7 +350,7 @@ set_best_effort_pts(AVCodecContext *const avctx, - { - pts_stats_add(ps, frame->pts); - -- frame->best_effort_timestamp = pts_stats_guess(ps); -+ frame->best_effort_timestamp = pts_stats_guess(ps, 1); - // If we can't guess from just PTS - try DTS - if (frame->best_effort_timestamp == AV_NOPTS_VALUE) - frame->best_effort_timestamp = frame->pkt_dts; -@@ -380,15 +385,25 @@ xlat_init(xlat_track_t * const x) - } - - static int --xlat_pending(const xlat_track_t * const x) -+xlat_pending(const V4L2m2mContext * const s) - { -+ const xlat_track_t *const x = &s->xlat; - unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; - int i; -- const int64_t now = x->last_pts; -+ const int64_t now = pts_stats_guess(&s->pts_stat, 0); -+ int64_t first_dts = AV_NOPTS_VALUE; -+ int no_dts_count = 0; -+ unsigned int interval = pts_stats_interval(&s->pts_stat); - - for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { - const V4L2m2mTrackEl * const t = x->track_els + n; - -+ if (first_dts == AV_NOPTS_VALUE) -+ if (t->dts == AV_NOPTS_VALUE) -+ ++no_dts_count; ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p *= 64; ++ if (a->height == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j]; + else -+ first_dts = t->dts; -+ - // Discard only set on never-set or flushed entries - // So if we get here we've never successfully decoded a frame so allow - // more frames into the buffer before stalling -@@ -408,6 +423,18 @@ xlat_pending(const xlat_track_t * const x) - break; - } - -+ if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) { -+ const int iframes = (first_dts - now) / (int)interval; -+ const int t = iframes - s->reorder_size + no_dts_count; -+ -+// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", 
Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n", -+// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count); -+ -+ if (iframes > 0 && iframes < 64 && t < i) { -+ return t; ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j]; ++ if (a->width == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i]; ++ else ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i]; + } -+ } -+ - return i; - } - -@@ -585,12 +612,12 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- int src_rv = NQ_OK; -+ int src_rv = -1; - int dst_rv = 1; // Non-zero (done), non-negative (error) number - unsigned int i = 0; - - do { -- const int pending = xlat_pending(&s->xlat); -+ const int pending = xlat_pending(s); - const int prefer_dq = (pending > 4); - const int last_src_rv = src_rv; - -@@ -966,8 +993,10 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx) - } - - static void --parse_extradata(AVCodecContext *avctx) -+parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) - { -+ s->reorder_size = 0; -+ - if (!avctx->extradata || !avctx->extradata_size) - return; - -@@ -996,6 +1025,7 @@ parse_extradata(AVCodecContext *avctx) - avctx->profile = ff_h264_get_profile(sps); - avctx->level = sps->level_idc; - } -+ s->reorder_size = sps->num_reorder_frames; - } - ff_h264_ps_uninit(&ps); - break; -@@ -1025,6 +1055,7 @@ parse_extradata(AVCodecContext *avctx) - if (sps) { - avctx->profile = sps->ptl.general_ptl.profile_idc; - avctx->level = sps->ptl.general_ptl.level_idc; -+ s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering; - } - } - ff_hevc_ps_uninit(&ps); -@@ -1057,12 +1088,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - avctx->ticks_per_frame = 2; - } - -- 
parse_extradata(avctx); -- - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; - -+ parse_extradata(avctx, s); -+ - xlat_init(&s->xlat); - pts_stats_init(&s->pts_stat, avctx, "decoder"); - - -From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Wed, 7 Jun 2023 11:14:52 +0000 -Subject: [PATCH 134/136] v4l2_m2m: Add encode size check - -Previously an out of bounds size would fail whilst trying to copy the -buffer with an unhelpful message. This produces a better error at init -time. - -(cherry picked from commit 0b61c4617e26f043d28d44c8767f7b9fd4882f97) ---- - libavcodec/v4l2_m2m.c | 43 +++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 43 insertions(+) - -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index f802687b1b..28d9ed4988 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -109,6 +109,44 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - return AVERROR(EINVAL); - } - -+static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) -+{ -+ struct v4l2_format fmt = {.type = s->output.type}; -+ int rv; -+ uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt); -+ unsigned int w; -+ unsigned int h; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { -+ fmt.fmt.pix_mp.pixelformat = pixfmt; -+ fmt.fmt.pix_mp.width = avctx->width; -+ fmt.fmt.pix_mp.height = avctx->height; -+ } -+ else { -+ fmt.fmt.pix.pixelformat = pixfmt; -+ fmt.fmt.pix.width = avctx->width; -+ fmt.fmt.pix.height = avctx->height; -+ } -+ -+ rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt); -+ -+ if (rv != 0) { -+ rv = AVERROR(errno); -+ av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } -+ -+ w = ff_v4l2_get_format_width(&fmt); -+ h = ff_v4l2_get_format_height(&fmt); -+ -+ if (w < avctx->width || h < avctx->height) { -+ av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, 
avctx->width, avctx->height, w, h); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; +} + - static int v4l2_probe_driver(V4L2m2mContext *s) - { - void *log_ctx = s->avctx; -@@ -128,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s) - goto done; - } - -+ // If being given frames (encode) check that V4L2 can cope with the size -+ if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO && -+ (ret = check_size(s->avctx, s)) != 0) -+ goto done; ++static void divide_and_round_nearest(matrix *a, float by) ++{ ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p = rintf(*p / by); ++ } ++} + - ret = ff_v4l2_context_get_format(&s->capture, 1); - if (ret) { - av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n"); - -From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Fri, 9 Jun 2023 10:28:12 +0000 -Subject: [PATCH 135/136] vf_bwdif: Add attributes to ask for vectorization - -(cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7) ---- - libavfilter/vf_bwdif.c | 29 +++++++++++++++-------------- - 1 file changed, 15 insertions(+), 14 deletions(-) - -diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c -index 65c617ebb3..09e68523bb 100644 ---- a/libavfilter/vf_bwdif.c -+++ b/libavfilter/vf_bwdif.c -@@ -74,10 +74,10 @@ typedef struct ThreadData { - int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ - int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ - int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ -- \ -+ {/*\ - if (!diff) { \ - dst[0] = d; \ -- } else { -+ } else {*/ - - #define SPAT_CHECK() \ - int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ -@@ -89,15 +89,16 @@ typedef struct ThreadData { - diff = FFMAX3(diff, min, -max); - - #define FILTER_LINE() \ -+ int i1, i2; \ - SPAT_CHECK() \ -- if (FFABS(c - e) > temporal_diff0) { \ -- interpol = 
(((coef_hf[0] * (prev2[0] + next2[0]) \ -+ /*if (FFABS(c - e) > temporal_diff0)*/ { \ -+ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ - - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ - + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ - + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } else { \ -- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } -+ } /*else*/ { \ -+ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -+ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ - - #define FILTER_EDGE() \ - if (spat) { \ -@@ -111,7 +112,7 @@ typedef struct ThreadData { - else if (interpol < d - diff) \ - interpol = d - diff; \ - \ -- dst[0] = av_clip(interpol, 0, clip_max); \ -+ dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \ - } \ - \ - dst++; \ -@@ -122,7 +123,7 @@ typedef struct ThreadData { - next2++; \ - } - --static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) - { - uint8_t *dst = dst1; -@@ -132,7 +133,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, - FILTER_INTRA() - } - --static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -150,7 +151,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, - FILTER2() - } - --static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, -+static void 
__attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) - { -@@ -167,7 +168,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, - FILTER2() - } - --static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) - { - uint16_t *dst = dst1; -@@ -177,7 +178,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre - FILTER_INTRA() - } - --static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -195,7 +196,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1 - FILTER2() - } - --static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) - { - -From f4012f09da1c57a0aa5db01f9096992d0c385f7b Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 13 Jun 2023 13:07:55 +0000 -Subject: [PATCH 136/136] v4l2m2m_dec: Fix h264 reorder size if no sps - initially - -(cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4) ---- - libavcodec/v4l2_m2m_dec.c | 2 +- - 1 file changed, 1 insertion(+), 1 
deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 13af62e819..11c83b2d66 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -1024,8 +1024,8 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) - if (sps) { - avctx->profile = ff_h264_get_profile(sps); - avctx->level = sps->level_idc; -+ s->reorder_size = sps->num_reorder_frames; - } -- s->reorder_size = sps->num_reorder_frames; - } - ff_h264_ps_uninit(&ps); - break; ++static void tweak(matrix *a) ++{ ++ for (int j = 4; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p += 1; ++ } ++} ++ ++/* The VC-1 spec places restrictions on the values permitted at three ++ * different stages: ++ * - D: the input coefficients in frequency domain ++ * - E: the intermediate coefficients, inverse-transformed only horizontally ++ * - R: the fully inverse-transformed coefficients ++ * ++ * To fully cater for the ranges specified requires various intermediate ++ * values to be held to 17-bit precision; yet these conditions do not appear ++ * to be utilised in real-world streams. At least some assembly ++ * implementations have chosen to restrict these values to 16-bit precision, ++ * to accelerate the decoding of real-world streams at the cost of strict ++ * adherence to the spec. To avoid our test marking these as failures, ++ * reduce our random inputs. ++ */ ++#define ATTENUATION 4 ++ ++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) ++{ ++ matrix *raw, *tmp, *D, *E, *R; ++ raw = new_matrix(width, height); ++ for (int i = 0; i < width * height; ++i) ++ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; ++ tmp = multiply(height == 8 ? &T8 : &T4, raw); ++ D = multiply(tmp, width == 8 ? 
&T8t : &T4t); ++ normalise(D); ++ divide_and_round_nearest(D, 1); ++ for (int i = 0; i < width * height; ++i) { ++ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ } ++ E = multiply(D, width == 8 ? &T8 : &T4); ++ divide_and_round_nearest(E, 8); ++ for (int i = 0; i < width * height; ++i) ++ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ R = multiply(height == 8 ? &T8t : &T4t, E); ++ tweak(R); ++ divide_and_round_nearest(R, 128); ++ for (int i = 0; i < width * height; ++i) ++ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ av_free(R); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ av_free(raw); ++ av_free(tmp); ++ av_free(E); ++ av_free(R); ++ return D; ++} ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd(); \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ ++ do { \ ++ uint8_t *p##0 = name##0, *p##1 = name##1; \ ++ int i = (size); \ ++ while (i-- > 0) { \ ++ int x = 0x80 | (rnd() & 0x7F); \ ++ x >>= rnd() % 9; \ ++ if (rnd() & 1) \ ++ x = -x; \ ++ *p##1++ = *p##0++ = 0x80 + x; \ ++ } \ ++ } while (0) ++ ++static void check_inv_trans_inplace(void) ++{ ++ /* Inverse transform input 
coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. ++ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients ++ * are stored in column-major order, and the outputs are written back ++ * to the input buffer, so we oversize it slightly to catch overruns. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]); ++ ++ VC1DSPContext h; ++ ++ ff_vc1dsp_init(&h); ++ ++ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8); ++ coeffs = generate_inverse_quantized_transform_coefficients(8, 8); ++ for (int j = 0; j < 8; ++j) ++ for (int i = 0; i < 8; ++i) { ++ int idx = 8 + i * 8 + j; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i]; ++ } ++ call_ref(inv_trans_in0 + 8); ++ call_new(inv_trans_in1 + 8); ++ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t))) ++ fail(); ++ bench_new(inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++} ++ ++static void check_inv_trans_adding(void) ++{ ++ /* Inverse transform input coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]); ++ ++ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and ++ * added with saturation to an array of unsigned 8-bit values. Oversize ++ * this by 8 samples left and right and one row above and below. 
*/ ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]); ++ ++ VC1DSPContext h; ++ ++ const test tests[] = { ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); ++ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); ++ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); ++ for (int j = 0; j < tests[t].height; ++j) ++ for (int i = 0; i < tests[t].width; ++i) { ++ int idx = j * 8 + i; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; ++ } ++ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); ++ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); ++ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) ++ fail(); ++ bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++ } ++} ++ ++static void check_loop_filter(void) ++{ ++ /* Deblocking filter buffers are big enough to hold a 16x16 block, ++ * plus 16 columns left and 4 rows above to hold filter inputs ++ * (depending on whether v or h neighbouring block edge, oversized ++ * horizontally to maintain 16-byte alignment) plus 16 columns and ++ * 4 rows below to catch write overflows */ ++ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); ++ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); ++ ++ VC1DSPContext h; 
++ ++ const test tests[] = { ++ VC1DSP_TEST(vc1_v_loop_filter4) ++ VC1DSP_TEST(vc1_h_loop_filter4) ++ VC1DSP_TEST(vc1_v_loop_filter8) ++ VC1DSP_TEST(vc1_h_loop_filter8) ++ VC1DSP_TEST(vc1_v_loop_filter16) ++ VC1DSP_TEST(vc1_h_loop_filter16) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ for (int count = 1000; count > 0; --count) { ++ int pq = rnd() % 31 + 1; ++ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); ++ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq); ++ call_new(filter_buf1 + 4 * 48 + 16, 48, pq); ++ if (memcmp(filter_buf0, filter_buf1, 24 * 48)) ++ fail(); ++ } ++ } ++ for (int j = 0; j < 24; ++j) ++ for (int i = 0; i < 48; ++i) ++ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); ++ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); ++ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); ++ } ++} ++ ++#define TEST_UNESCAPE \ ++ do { \ ++ for (int count = 100; count > 0; --count) { \ ++ escaped_offset = rnd() & 7; \ ++ unescaped_offset = rnd() & 7; \ ++ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \ ++ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \ ++ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \ ++ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \ ++ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \ ++ fail(); \ ++ } \ ++ } while (0) ++ ++static void check_unescape(void) ++{ ++ /* This appears to be a typical length of buffer in use */ ++#define LOG2_UNESCAPE_BUF_SIZE 17 ++#define UNESCAPE_BUF_SIZE (1u< ++#include "checkasm.h" ++#include 
"libavcodec/internal.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/mem_internal.h" ++ ++#define WIDTH 256 ++ ++#define randomize_buffers(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = rnd() & mask ++ ++#define randomize_overflow_check(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0; ++ ++#define BODY(type, depth) \ ++ do { \ ++ type prev0[9*WIDTH], prev1[9*WIDTH]; \ ++ type next0[9*WIDTH], next1[9*WIDTH]; \ ++ type cur0[9*WIDTH], cur1[9*WIDTH]; \ ++ type dst0[WIDTH], dst1[WIDTH]; \ ++ const int stride = WIDTH; \ ++ const int mask = (1<"${SMB_LOCK}" +flock 200 + +SMB_TMP=$(mktemp -p ${SMB_DIR}) + SMB_USERCONF_IS_VALID=no SMB_CONFIG_VERSION=4 # If user config is based on legacy OpenELEC, or old version (or no version) # then don't use it, and log a warning. -if [ -f $SMB_USERCONF ]; then - SMB_IS_LEGACY="$(awk 'NR <= 2 && /This file is part of OpenELEC/{ print }' $SMB_USERCONF)" - SMB_THIS_VER="$(awk '/^# samba.conf v[0-9\.]*/{ print substr($3,2); exit }' $SMB_USERCONF)" +if [ -f ${SMB_USERCONF} ]; then + SMB_IS_LEGACY="$(awk 'NR <= 2 && /This file is part of OpenELEC/{ print }' ${SMB_USERCONF})" + SMB_THIS_VER="$(awk '/^# samba.conf v[0-9\.]*/{ print substr($3,2); exit }' ${SMB_USERCONF})" if [ -n "${SMB_IS_LEGACY}" ]; then - echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [Old style OpenELEC]" + echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [Old style OpenELEC]" elif [ -z "${SMB_THIS_VER}" ]; then - echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [version is unknown or invalid]" + echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [version is unknown or invalid]" elif [ ${SMB_THIS_VER} != ${SMB_CONFIG_VERSION} ]; then - echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [version ${SMB_THIS_VER} is not the required 
version $SMB_CONFIG_VERSION]" + echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [version ${SMB_THIS_VER} is not the required version ${SMB_CONFIG_VERSION}]" else SMB_USERCONF_IS_VALID=yes fi fi -mkdir -p $(dirname $SMB_CONF) - if [ $SMB_USERCONF_IS_VALID = yes ]; then - cp $SMB_USERCONF $SMB_CONF - else - cp $SMB_DEFCONF $SMB_CONF - fi - -# Generate smb.conf, unless disabled -if [ ! -f /storage/.cache/services/samba.disabled ]; then - /usr/lib/samba/smbd-config +if [ ${SMB_USERCONF_IS_VALID} = yes ]; then + cp ${SMB_USERCONF} ${SMB_TMP} +else + cp ${SMB_DEFCONF} ${SMB_TMP} fi +echo >>${SMB_TMP} + +if [ ! -f /storage/.cache/services/samba.disabled ]; then + + ### Generate smb.conf + + if [ ! -f /storage/.cache/services/samba.conf ]; then + cp /usr/share/services/samba.conf /storage/.cache/services + fi + + # Specify defaults here, in case these new properties not yet added in .cache + SAMBA_WORKGROUP=WORKGROUP + SAMBA_MINPROTOCOL=SMB2 + SAMBA_MAXPROTOCOL=SMB3 + + . /storage/.cache/services/samba.conf + + # fixup synonyms + sed -i 's/browsable/browseable/g; s/writable/writeable/g' ${SMB_TMP} + + # handle external drives + if [ "${SAMBA_AUTOSHARE}" = "true" ] ; then + for dir in /media/* ; do + if [ -d "$dir" ] ; then + name=$(basename "$dir") + echo -e "[$name]\n path = $dir\n available = yes\n browseable = yes\n public = yes\n writeable = yes\n" >> ${SMB_TMP} + fi + done + fi + + # Allow access to a "failed" (safe mode) Kodi installation + if [ -d /storage/.kodi.FAILED ]; then + echo -e "[Kodi-Failed]\n path = /storage/.kodi.FAILED\n available = yes\n browseable = yes\n public = yes\n writeable = yes\n" >> ${SMB_TMP} + fi + + ADD_CONFIG= + + # If workgroup is not set, don't set it - who knows, user may know better. 
+ if [ -n "$SAMBA_WORKGROUP" ]; then + # Remove any existing workgroup setting + sed -E '/^[[:space:]]*workgroup[[:space:]]*=/d' -i ${SMB_TMP} + ADD_CONFIG="${ADD_CONFIG} workgroup = ${SAMBA_WORKGROUP:-WORKGROUP}\n" + fi + + ADD_CONFIG="${ADD_CONFIG} server min protocol = ${SAMBA_MINPROTOCOL/SMB1/NT1}\n" + ADD_CONFIG="${ADD_CONFIG} server max protocol = ${SAMBA_MAXPROTOCOL/SMB1/NT1}\n" + + # Add extra config after [global], escaping spaces so that all are retained by sed + sed -e "/\[global\]/ a ${ADD_CONFIG// /\\ }" -i ${SMB_TMP} + + if [ "${SAMBA_SECURE}" = "true" -a -n "${SAMBA_USERNAME}" -a -n "${SAMBA_PASSWORD}" ] ; then + # username map: first line makes sure plain root does not work all the time + # processing continues, so if user chooses root as username, second line overrides the first + # this is done always in case user uses passwords in userconf. + # many thanks to viljoviitanen for this + sed -e 's|^.[ \t]*.public.=.*| public = no |' \ + -e 's|^.[ \t]*.username map.=.*||' \ + -e 's|^.[ \t]*.security.=.*| security = user\n username map = /run/samba/samba.map|' \ + -e 's|^.[ \t]*.map.to.guest.=.*| map to guest = Never|' \ + -i ${SMB_TMP} + + printf "%s\n%s" "${SAMBA_PASSWORD}" "${SAMBA_PASSWORD}" | smbpasswd -c ${SMB_TMP} -s -a root + printf 'nobody = root\nroot = "%s"\n' "${SAMBA_USERNAME}" > /run/samba/samba.map + + else + sed -e 's|^.[ \t]*.public.=.*| public = yes |' \ + -e 's|^.[ \t]*.username map.=.*||' \ + -e 's|^.[ \t]*.security.=.*| security = user|' \ + -e 's|^.[ \t]*.map.to.guest.=.*| map to guest = Bad User|' \ + -i ${SMB_TMP} + fi +fi + +mv -f ${SMB_TMP} ${SMB_CONF} + exit 0 diff --git a/packages/network/samba/scripts/smbd-config b/packages/network/samba/scripts/smbd-config deleted file mode 100755 index aed4730a3a..0000000000 --- a/packages/network/samba/scripts/smbd-config +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/sh - -# SPDX-License-Identifier: GPL-2.0-or-later -# Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv) -# Copyright (C) 
2020-present Team LibreELEC (https://libreelec.tv) - -SMB_CONF="/run/samba/smb.conf" -SMB_TMP="$(mktemp -p /run/samba)" - -cp -f $SMB_CONF $SMB_TMP - -if [ ! -f /storage/.cache/services/samba.conf ]; then - cp /usr/share/services/samba.conf /storage/.cache/services -fi - -# Specify defaults here, in case these new properties not yet added in .cache -SAMBA_WORKGROUP=WORKGROUP -SAMBA_MINPROTOCOL=SMB2 -SAMBA_MAXPROTOCOL=SMB3 - -. /storage/.cache/services/samba.conf - -# fixup synonyms -sed -i 's/browsable/browseable/g; s/writable/writeable/g' $SMB_TMP - -# handle external drives -if [ "$SAMBA_AUTOSHARE" == "true" ] ; then - for dir in /media/* ; do - if [ -d "$dir" ] ; then - name=$(basename "$dir") - echo -e "[$name]\n path = $dir\n available = yes\n browseable = yes\n public = yes\n writeable = yes\n" >> $SMB_TMP - fi - done -fi - -# Allow access to a "failed" (safe mode) Kodi installation -if [ -d /storage/.kodi.FAILED ]; then - echo -e "[Kodi-Failed]\n path = /storage/.kodi.FAILED\n available = yes\n browseable = yes\n public = yes\n writeable = yes\n" >> $SMB_TMP -fi - -ADD_CONFIG= - -# If workgroup is not set, don't set it - who knows, user may know better. -if [ -n "$SAMBA_WORKGROUP" ]; then - # Remove any existing workgroup setting - sed -E '/^[[:space:]]*workgroup[[:space:]]*=/d' -i $SMB_TMP - ADD_CONFIG="${ADD_CONFIG} workgroup = ${SAMBA_WORKGROUP:-WORKGROUP}\n" -fi - -ADD_CONFIG="${ADD_CONFIG} server min protocol = ${SAMBA_MINPROTOCOL/SMB1/NT1}\n" -ADD_CONFIG="${ADD_CONFIG} server max protocol = ${SAMBA_MAXPROTOCOL/SMB1/NT1}\n" - -# Add extra config after [global], escaping spaces so that all are retained by sed -sed -e "/\[global\]/ a ${ADD_CONFIG// /\\ }" -i $SMB_TMP - -if [ "$SAMBA_SECURE" == "true" -a ! "$SAMBA_USERNAME" == "" -a ! 
"$SAMBA_PASSWORD" == "" ] ; then - # username map: first line makes sure plain root does not work all the time - # processing continues, so if user chooses root as username, second line overrides the first - # this is done always in case user uses passwords in userconf. - # many thanks to viljoviitanen for this - printf "%s\n%s" "$SAMBA_PASSWORD" "$SAMBA_PASSWORD" | smbpasswd -s -a root >/dev/null 2>&1 - printf "nobody = root\nroot = %s" "$SAMBA_USERNAME" > /run/samba/samba.map - - sed -e 's|^.[ \t]*.public.=.*| public = no |' \ - -e 's|^.[ \t]*.username map.=.*||' \ - -e 's|^.[ \t]*.security.=.*| security = user\n username map = /run/samba/samba.map|' \ - -e 's|^.[ \t]*.map.to.guest.=.*| map to guest = Never|' \ - -i $SMB_TMP -else - sed -e 's|^.[ \t]*.public.=.*| public = yes |' \ - -e 's|^.[ \t]*.username map.=.*||' \ - -e 's|^.[ \t]*.security.=.*| security = user|' \ - -e 's|^.[ \t]*.map.to.guest.=.*| map to guest = Bad User|' \ - -i $SMB_TMP -fi - -mv -f $SMB_TMP $SMB_CONF diff --git a/packages/network/wireless-regdb/package.mk b/packages/network/wireless-regdb/package.mk index fa3a7a9d62..27860afd03 100644 --- a/packages/network/wireless-regdb/package.mk +++ b/packages/network/wireless-regdb/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="wireless-regdb" -PKG_VERSION="2023.05.03" -PKG_SHA256="f254d08ab3765aeae2b856222e11a95d44aef519a6663877c71ef68fae4c8c12" +PKG_VERSION="2023.09.01" +PKG_SHA256="26d4c2a727cc59239b84735aad856b7c7d0b04e30aa5c235c4f7f47f5f053491" PKG_LICENSE="GPL" PKG_SITE="https://wireless.wiki.kernel.org/en/developers/regulatory/wireless-regdb" PKG_URL="https://www.kernel.org/pub/software/network/${PKG_NAME}/${PKG_NAME}-${PKG_VERSION}.tar.xz" diff --git a/packages/security/openssl/cert/cacert.pem b/packages/security/openssl/cert/cacert.pem index ab8c20fa53..a1617cb833 100644 --- a/packages/security/openssl/cert/cacert.pem +++ b/packages/security/openssl/cert/cacert.pem @@ -1,7 +1,7 @@ ## 
## Bundle of CA Root Certificates ## -## Certificate data from Mozilla as of: Fri Jul 21 14:36:19 2023 GMT +## Certificate data from Mozilla as of: Sat Nov 18 22:59:13 2023 GMT ## ## This is a bundle of X.509 certificates of public Certificate Authorities ## (CA). These were automatically extracted from Mozilla's root certificates @@ -14,7 +14,7 @@ ## Just configure this file as the SSLCACertificateFile. ## ## Conversion done with mk-ca-bundle.pl version 1.29. -## SHA256: 0ff137babc6a5561a9cfbe9f29558972e5b528202681b7d3803d03a3e82922bd +## SHA256: 1970dd65858925d68498d2356aea6d03f764422523c5887deca8ce3ba9e1f845 ## @@ -200,27 +200,6 @@ vGJHvOB0K7Lrfb5BG7XARsWhIstfTsEokt4YutUqKLsRixeTmJlglFwjz1onl14LBQaTNx47aTbr qZ5hHY8y2o4M1nQ+ewkk2gF3R8Q7zTSMmfXK4SVhM7JZG+Ju1zdXtg2pEto= -----END CERTIFICATE----- -Security Communication Root CA -============================== ------BEGIN CERTIFICATE----- -MIIDWjCCAkKgAwIBAgIBADANBgkqhkiG9w0BAQUFADBQMQswCQYDVQQGEwJKUDEYMBYGA1UEChMP -U0VDT00gVHJ1c3QubmV0MScwJQYDVQQLEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTEw -HhcNMDMwOTMwMDQyMDQ5WhcNMjMwOTMwMDQyMDQ5WjBQMQswCQYDVQQGEwJKUDEYMBYGA1UEChMP -U0VDT00gVHJ1c3QubmV0MScwJQYDVQQLEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTEw -ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCzs/5/022x7xZ8V6UMbXaKL0u/ZPtM7orw -8yl89f/uKuDp6bpbZCKamm8sOiZpUQWZJtzVHGpxxpp9Hp3dfGzGjGdnSj74cbAZJ6kJDKaVv0uM -DPpVmDvY6CKhS3E4eayXkmmziX7qIWgGmBSWh9JhNrxtJ1aeV+7AwFb9Ms+k2Y7CI9eNqPPYJayX -5HA49LY6tJ07lyZDo6G8SVlyTCMwhwFY9k6+HGhWZq/NQV3Is00qVUarH9oe4kA92819uZKAnDfd -DJZkndwi92SL32HeFZRSFaB9UslLqCHJxrHty8OVYNEP8Ktw+N/LTX7s1vqr2b1/VPKl6Xn62dZ2 -JChzAgMBAAGjPzA9MB0GA1UdDgQWBBSgc0mZaNyFW2XjmygvV5+9M7wHSDALBgNVHQ8EBAMCAQYw -DwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQUFAAOCAQEAaECpqLvkT115swW1F7NgE+vGkl3g -0dNq/vu+m22/xwVtWSDEHPC32oRYAmP6SBbvT6UL90qY8j+eG61Ha2POCEfrUj94nK9NrvjVT8+a -mCoQQTlSxN3Zmw7vkwGusi7KaEIkQmywszo+zenaSMQVy+n5Bw+SUEmK3TGXX8npN6o7WWWXlDLJ -s58+OmJYxUmtYg5xpTKqL8aJdkNAExNnPaJUJRDL8Try2frbSVa7pv6nQTXD4IhhyYjH3zYQIphZ 
-6rBK+1YWc26sTfcioU+tHXotRSflMMFe8toTyyVCUZVHA4xsIcx0Qu1T/zOLjw9XARYvz6buyXAi -FL39vmwLAw== ------END CERTIFICATE----- - XRamp Global CA Root ==================== -----BEGIN CERTIFICATE----- @@ -669,39 +648,6 @@ YIvDQVETI53O9zJrlAGomecsMx86OyXShkDOOyyGeMlhLxS67ttVb9+E7gUJTb0o2HLO02JQZR7r kpeDMdmztcpHWD9f -----END CERTIFICATE----- -Autoridad de Certificacion Firmaprofesional CIF A62634068 -========================================================= ------BEGIN CERTIFICATE----- -MIIGFDCCA/ygAwIBAgIIU+w77vuySF8wDQYJKoZIhvcNAQEFBQAwUTELMAkGA1UEBhMCRVMxQjBA -BgNVBAMMOUF1dG9yaWRhZCBkZSBDZXJ0aWZpY2FjaW9uIEZpcm1hcHJvZmVzaW9uYWwgQ0lGIEE2 -MjYzNDA2ODAeFw0wOTA1MjAwODM4MTVaFw0zMDEyMzEwODM4MTVaMFExCzAJBgNVBAYTAkVTMUIw -QAYDVQQDDDlBdXRvcmlkYWQgZGUgQ2VydGlmaWNhY2lvbiBGaXJtYXByb2Zlc2lvbmFsIENJRiBB -NjI2MzQwNjgwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKlmuO6vj78aI14H9M2uDD -Utd9thDIAl6zQyrET2qyyhxdKJp4ERppWVevtSBC5IsP5t9bpgOSL/UR5GLXMnE42QQMcas9UX4P -B99jBVzpv5RvwSmCwLTaUbDBPLutN0pcyvFLNg4kq7/DhHf9qFD0sefGL9ItWY16Ck6WaVICqjaY -7Pz6FIMMNx/Jkjd/14Et5cS54D40/mf0PmbR0/RAz15iNA9wBj4gGFrO93IbJWyTdBSTo3OxDqqH -ECNZXyAFGUftaI6SEspd/NYrspI8IM/hX68gvqB2f3bl7BqGYTM+53u0P6APjqK5am+5hyZvQWyI -plD9amML9ZMWGxmPsu2bm8mQ9QEM3xk9Dz44I8kvjwzRAv4bVdZO0I08r0+k8/6vKtMFnXkIoctX -MbScyJCyZ/QYFpM6/EfY0XiWMR+6KwxfXZmtY4laJCB22N/9q06mIqqdXuYnin1oKaPnirjaEbsX -LZmdEyRG98Xi2J+Of8ePdG1asuhy9azuJBCtLxTa/y2aRnFHvkLfuwHb9H/TKI8xWVvTyQKmtFLK -bpf7Q8UIJm+K9Lv9nyiqDdVF8xM6HdjAeI9BZzwelGSuewvF6NkBiDkal4ZkQdU7hwxu+g/GvUgU -vzlN1J5Bto+WHWOWk9mVBngxaJ43BjuAiUVhOSPHG0SjFeUc+JIwuwIDAQABo4HvMIHsMBIGA1Ud -EwEB/wQIMAYBAf8CAQEwDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBRlzeurNR4APn7VdMActHNH -DhpkLzCBpgYDVR0gBIGeMIGbMIGYBgRVHSAAMIGPMC8GCCsGAQUFBwIBFiNodHRwOi8vd3d3LmZp -cm1hcHJvZmVzaW9uYWwuY29tL2NwczBcBggrBgEFBQcCAjBQHk4AUABhAHMAZQBvACAAZABlACAA -bABhACAAQgBvAG4AYQBuAG8AdgBhACAANAA3ACAAQgBhAHIAYwBlAGwAbwBuAGEAIAAwADgAMAAx -ADcwDQYJKoZIhvcNAQEFBQADggIBABd9oPm03cXF661LJLWhAqvdpYhKsg9VSytXjDvlMd3+xDLx 
-51tkljYyGOylMnfX40S2wBEqgLk9am58m9Ot/MPWo+ZkKXzR4Tgegiv/J2Wv+xYVxC5xhOW1//qk -R71kMrv2JYSiJ0L1ILDCExARzRAVukKQKtJE4ZYm6zFIEv0q2skGz3QeqUvVhyj5eTSSPi5E6PaP -T481PyWzOdxjKpBrIF/EUhJOlywqrJ2X3kjyo2bbwtKDlaZmp54lD+kLM5FlClrD2VQS3a/DTg4f -Jl4N3LON7NWBcN7STyQF82xO9UxJZo3R/9ILJUFI/lGExkKvgATP0H5kSeTy36LssUzAKh3ntLFl -osS88Zj0qnAHY7S42jtM+kAiMFsRpvAFDsYCA0irhpuF3dvd6qJ2gHN99ZwExEWN57kci57q13XR -crHedUTnQn3iV2t93Jm8PYMo6oCTjcVMZcFwgbg4/EMxsvYDNEeyrPsiBsse3RdHHF9mudMaotoR -saS8I8nkvof/uZS2+F0gStRf571oe2XyFR7SOqkt6dhrJKyXWERHrVkY8SFlcN7ONGCoQPHzPKTD -KCOM/iczQ0CgFzzr6juwcqajuUpLXhZI9LK8yIySxZ2frHI2vDSANGupi5LAuBft7HZT9SQBjLMi -6Et8Vcad+qMUu2WFbm5PEn4KPJ2V ------END CERTIFICATE----- - Izenpe.com ========== -----BEGIN CERTIFICATE----- @@ -3449,3 +3395,140 @@ TFsR0PXNor6uzFFcw9VUewyu1rkGd4Di7wcaaMxZUa1+XGdrudviB0JbuAEFWDlN5LuYo7Ey7Nmj PqYO5Wue/9vsL3SD3460s6neFE3/MaNFcyT6lSnMEpcEoji2jbDwN/zIIX8/syQbPYtuzE2wFg2W HYMfRsCbvUOZ58SWLs5fyQ== -----END CERTIFICATE----- + +TrustAsia Global Root CA G3 +=========================== +-----BEGIN CERTIFICATE----- +MIIFpTCCA42gAwIBAgIUZPYOZXdhaqs7tOqFhLuxibhxkw8wDQYJKoZIhvcNAQEMBQAwWjELMAkG +A1UEBhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dpZXMsIEluYy4xJDAiBgNVBAMM +G1RydXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHMzAeFw0yMTA1MjAwMjEwMTlaFw00NjA1MTkwMjEw +MTlaMFoxCzAJBgNVBAYTAkNOMSUwIwYDVQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMu +MSQwIgYDVQQDDBtUcnVzdEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzMwggIiMA0GCSqGSIb3DQEBAQUA +A4ICDwAwggIKAoICAQDAMYJhkuSUGwoqZdC+BqmHO1ES6nBBruL7dOoKjbmzTNyPtxNST1QY4Sxz +lZHFZjtqz6xjbYdT8PfxObegQ2OwxANdV6nnRM7EoYNl9lA+sX4WuDqKAtCWHwDNBSHvBm3dIZwZ +Q0WhxeiAysKtQGIXBsaqvPPW5vxQfmZCHzyLpnl5hkA1nyDvP+uLRx+PjsXUjrYsyUQE49RDdT/V +P68czH5GX6zfZBCK70bwkPAPLfSIC7Epqq+FqklYqL9joDiR5rPmd2jE+SoZhLsO4fWvieylL1Ag +dB4SQXMeJNnKziyhWTXAyB1GJ2Faj/lN03J5Zh6fFZAhLf3ti1ZwA0pJPn9pMRJpxx5cynoTi+jm +9WAPzJMshH/x/Gr8m0ed262IPfN2dTPXS6TIi/n1Q1hPy8gDVI+lhXgEGvNz8teHHUGf59gXzhqc +D0r83ERoVGjiQTz+LISGNzzNPy+i2+f3VANfWdP3kXjHi3dqFuVJhZBFcnAvkV34PmVACxmZySYg 
+WmjBNb9Pp1Hx2BErW+Canig7CjoKH8GB5S7wprlppYiU5msTf9FkPz2ccEblooV7WIQn3MSAPmea +mseaMQ4w7OYXQJXZRe0Blqq/DPNL0WP3E1jAuPP6Z92bfW1K/zJMtSU7/xxnD4UiWQWRkUF3gdCF +TIcQcf+eQxuulXUtgQIDAQABo2MwYTAPBgNVHRMBAf8EBTADAQH/MB8GA1UdIwQYMBaAFEDk5PIj +7zjKsK5Xf/IhMBY027ySMB0GA1UdDgQWBBRA5OTyI+84yrCuV3/yITAWNNu8kjAOBgNVHQ8BAf8E +BAMCAQYwDQYJKoZIhvcNAQEMBQADggIBACY7UeFNOPMyGLS0XuFlXsSUT9SnYaP4wM8zAQLpw6o1 +D/GUE3d3NZ4tVlFEbuHGLige/9rsR82XRBf34EzC4Xx8MnpmyFq2XFNFV1pF1AWZLy4jVe5jaN/T +G3inEpQGAHUNcoTpLrxaatXeL1nHo+zSh2bbt1S1JKv0Q3jbSwTEb93mPmY+KfJLaHEih6D4sTNj +duMNhXJEIlU/HHzp/LgV6FL6qj6jITk1dImmasI5+njPtqzn59ZW/yOSLlALqbUHM/Q4X6RJpstl +cHboCoWASzY9M/eVVHUl2qzEc4Jl6VL1XP04lQJqaTDFHApXB64ipCz5xUG3uOyfT0gA+QEEVcys ++TIxxHWVBqB/0Y0n3bOppHKH/lmLmnp0Ft0WpWIp6zqW3IunaFnT63eROfjXy9mPX1onAX1daBli +2MjN9LdyR75bl87yraKZk62Uy5P2EgmVtqvXO9A/EcswFi55gORngS1d7XB4tmBZrOFdRWOPyN9y +aFvqHbgB8X7754qz41SgOAngPN5C8sLtLpvzHzW2NtjjgKGLzZlkD8Kqq7HK9W+eQ42EVJmzbsAS +ZthwEPEGNTNDqJwuuhQxzhB/HIbjj9LV+Hfsm6vxL2PZQl/gZ4FkkfGXL/xuJvYz+NO1+MRiqzFR +JQJ6+N1rZdVtTTDIZbpoFGWsJwt0ivKH +-----END CERTIFICATE----- + +TrustAsia Global Root CA G4 +=========================== +-----BEGIN CERTIFICATE----- +MIICVTCCAdygAwIBAgIUTyNkuI6XY57GU4HBdk7LKnQV1tcwCgYIKoZIzj0EAwMwWjELMAkGA1UE +BhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dpZXMsIEluYy4xJDAiBgNVBAMMG1Ry +dXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHNDAeFw0yMTA1MjAwMjEwMjJaFw00NjA1MTkwMjEwMjJa +MFoxCzAJBgNVBAYTAkNOMSUwIwYDVQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMuMSQw +IgYDVQQDDBtUcnVzdEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzQwdjAQBgcqhkjOPQIBBgUrgQQAIgNi +AATxs8045CVD5d4ZCbuBeaIVXxVjAd7Cq92zphtnS4CDr5nLrBfbK5bKfFJV4hrhPVbwLxYI+hW8 +m7tH5j/uqOFMjPXTNvk4XatwmkcN4oFBButJ+bAp3TPsUKV/eSm4IJijYzBhMA8GA1UdEwEB/wQF +MAMBAf8wHwYDVR0jBBgwFoAUpbtKl86zK3+kMd6Xg1mDpm9xy94wHQYDVR0OBBYEFKW7SpfOsyt/ +pDHel4NZg6ZvccveMA4GA1UdDwEB/wQEAwIBBjAKBggqhkjOPQQDAwNnADBkAjBe8usGzEkxn0AA +bbd+NvBNEU/zy4k6LHiRUKNbwMp1JvK/kF0LgoxgKJ/GcJpo5PECMFxYDlZ2z1jD1xCMuo6u47xk +dUfFVZDj/bpV6wfEU6s3qe4hsiFbYI89MvHVI5TWWA== +-----END 
CERTIFICATE----- + +CommScope Public Trust ECC Root-01 +================================== +-----BEGIN CERTIFICATE----- +MIICHTCCAaOgAwIBAgIUQ3CCd89NXTTxyq4yLzf39H91oJ4wCgYIKoZIzj0EAwMwTjELMAkGA1UE +BhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBUcnVz +dCBFQ0MgUm9vdC0wMTAeFw0yMTA0MjgxNzM1NDNaFw00NjA0MjgxNzM1NDJaME4xCzAJBgNVBAYT +AlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3Qg +RUNDIFJvb3QtMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAARLNumuV16ocNfQj3Rid8NeeqrltqLx +eP0CflfdkXmcbLlSiFS8LwS+uM32ENEp7LXQoMPwiXAZu1FlxUOcw5tjnSCDPgYLpkJEhRGnSjot +6dZoL0hOUysHP029uax3OVejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMB0G +A1UdDgQWBBSOB2LAUN3GGQYARnQE9/OufXVNMDAKBggqhkjOPQQDAwNoADBlAjEAnDPfQeMjqEI2 +Jpc1XHvr20v4qotzVRVcrHgpD7oh2MSg2NED3W3ROT3Ek2DS43KyAjB8xX6I01D1HiXo+k515liW +pDVfG2XqYZpwI7UNo5uSUm9poIyNStDuiw7LR47QjRE= +-----END CERTIFICATE----- + +CommScope Public Trust ECC Root-02 +================================== +-----BEGIN CERTIFICATE----- +MIICHDCCAaOgAwIBAgIUKP2ZYEFHpgE6yhR7H+/5aAiDXX0wCgYIKoZIzj0EAwMwTjELMAkGA1UE +BhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBUcnVz +dCBFQ0MgUm9vdC0wMjAeFw0yMTA0MjgxNzQ0NTRaFw00NjA0MjgxNzQ0NTNaME4xCzAJBgNVBAYT +AlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3Qg +RUNDIFJvb3QtMDIwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAR4MIHoYx7l63FRD/cHB8o5mXxO1Q/M +MDALj2aTPs+9xYa9+bG3tD60B8jzljHz7aRP+KNOjSkVWLjVb3/ubCK1sK9IRQq9qEmUv4RDsNuE +SgMjGWdqb8FuvAY5N9GIIvejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMB0G +A1UdDgQWBBTmGHX/72DehKT1RsfeSlXjMjZ59TAKBggqhkjOPQQDAwNnADBkAjAmc0l6tqvmSfR9 +Uj/UQQSugEODZXW5hYA4O9Zv5JOGq4/nich/m35rChJVYaoR4HkCMHfoMXGsPHED1oQmHhS48zs7 +3u1Z/GtMMH9ZzkXpc2AVmkzw5l4lIhVtwodZ0LKOag== +-----END CERTIFICATE----- + +CommScope Public Trust RSA Root-01 +================================== +-----BEGIN CERTIFICATE----- +MIIFbDCCA1SgAwIBAgIUPgNJgXUWdDGOTKvVxZAplsU5EN0wDQYJKoZIhvcNAQELBQAwTjELMAkG 
+A1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBU +cnVzdCBSU0EgUm9vdC0wMTAeFw0yMTA0MjgxNjQ1NTRaFw00NjA0MjgxNjQ1NTNaME4xCzAJBgNV +BAYTAlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1 +c3QgUlNBIFJvb3QtMDEwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCwSGWjDR1C45Ft +nYSkYZYSwu3D2iM0GXb26v1VWvZVAVMP8syMl0+5UMuzAURWlv2bKOx7dAvnQmtVzslhsuitQDy6 +uUEKBU8bJoWPQ7VAtYXR1HHcg0Hz9kXHgKKEUJdGzqAMxGBWBB0HW0alDrJLpA6lfO741GIDuZNq +ihS4cPgugkY4Iw50x2tBt9Apo52AsH53k2NC+zSDO3OjWiE260f6GBfZumbCk6SP/F2krfxQapWs +vCQz0b2If4b19bJzKo98rwjyGpg/qYFlP8GMicWWMJoKz/TUyDTtnS+8jTiGU+6Xn6myY5QXjQ/c +Zip8UlF1y5mO6D1cv547KI2DAg+pn3LiLCuz3GaXAEDQpFSOm117RTYm1nJD68/A6g3czhLmfTif +BSeolz7pUcZsBSjBAg/pGG3svZwG1KdJ9FQFa2ww8esD1eo9anbCyxooSU1/ZOD6K9pzg4H/kQO9 +lLvkuI6cMmPNn7togbGEW682v3fuHX/3SZtS7NJ3Wn2RnU3COS3kuoL4b/JOHg9O5j9ZpSPcPYeo +KFgo0fEbNttPxP/hjFtyjMcmAyejOQoBqsCyMWCDIqFPEgkBEa801M/XrmLTBQe0MXXgDW1XT2mH ++VepuhX2yFJtocucH+X8eKg1mp9BFM6ltM6UCBwJrVbl2rZJmkrqYxhTnCwuwwIDAQABo0IwQDAP +BgNVHRMBAf8EBTADAQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUN12mmnQywsL5x6YVEFm4 +5P3luG0wDQYJKoZIhvcNAQELBQADggIBAK+nz97/4L1CjU3lIpbfaOp9TSp90K09FlxD533Ahuh6 +NWPxzIHIxgvoLlI1pKZJkGNRrDSsBTtXAOnTYtPZKdVUvhwQkZyybf5Z/Xn36lbQnmhUQo8mUuJM +3y+Xpi/SB5io82BdS5pYV4jvguX6r2yBS5KPQJqTRlnLX3gWsWc+QgvfKNmwrZggvkN80V4aCRck +jXtdlemrwWCrWxhkgPut4AZ9HcpZuPN4KWfGVh2vtrV0KnahP/t1MJ+UXjulYPPLXAziDslg+Mkf +Foom3ecnf+slpoq9uC02EJqxWE2aaE9gVOX2RhOOiKy8IUISrcZKiX2bwdgt6ZYD9KJ0DLwAHb/W +NyVntHKLr4W96ioDj8z7PEQkguIBpQtZtjSNMgsSDesnwv1B10A8ckYpwIzqug/xBpMu95yo9GA+ +o/E4Xo4TwbM6l4c/ksp4qRyv0LAbJh6+cOx69TOY6lz/KwsETkPdY34Op054A5U+1C0wlREQKC6/ +oAI+/15Z0wUOlV9TRe9rh9VIzRamloPh37MG88EU26fsHItdkJANclHnYfkUyq+Dj7+vsQpZXdxc +1+SWrVtgHdqul7I52Qb1dgAT+GhMIbA1xNxVssnBQVocicCMb3SgazNNtQEo/a2tiRc7ppqEvOuM +6sRxJKi6KfkIsidWNTJf6jn7MZrVGczw +-----END CERTIFICATE----- + +CommScope Public Trust RSA Root-02 +================================== +-----BEGIN CERTIFICATE----- 
+MIIFbDCCA1SgAwIBAgIUVBa/O345lXGN0aoApYYNK496BU4wDQYJKoZIhvcNAQELBQAwTjELMAkG +A1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBU +cnVzdCBSU0EgUm9vdC0wMjAeFw0yMTA0MjgxNzE2NDNaFw00NjA0MjgxNzE2NDJaME4xCzAJBgNV +BAYTAlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1 +c3QgUlNBIFJvb3QtMDIwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDh+g77aAASyE3V +rCLENQE7xVTlWXZjpX/rwcRqmL0yjReA61260WI9JSMZNRTpf4mnG2I81lDnNJUDMrG0kyI9p+Kx +7eZ7Ti6Hmw0zdQreqjXnfuU2mKKuJZ6VszKWpCtYHu8//mI0SFHRtI1CrWDaSWqVcN3SAOLMV2MC +e5bdSZdbkk6V0/nLKR8YSvgBKtJjCW4k6YnS5cciTNxzhkcAqg2Ijq6FfUrpuzNPDlJwnZXjfG2W +Wy09X6GDRl224yW4fKcZgBzqZUPckXk2LHR88mcGyYnJ27/aaL8j7dxrrSiDeS/sOKUNNwFnJ5rp +M9kzXzehxfCrPfp4sOcsn/Y+n2Dg70jpkEUeBVF4GiwSLFworA2iI540jwXmojPOEXcT1A6kHkIf +hs1w/tkuFT0du7jyU1fbzMZ0KZwYszZ1OC4PVKH4kh+Jlk+71O6d6Ts2QrUKOyrUZHk2EOH5kQMr +eyBUzQ0ZGshBMjTRsJnhkB4BQDa1t/qp5Xd1pCKBXbCL5CcSD1SIxtuFdOa3wNemKfrb3vOTlycE +VS8KbzfFPROvCgCpLIscgSjX74Yxqa7ybrjKaixUR9gqiC6vwQcQeKwRoi9C8DfF8rhW3Q5iLc4t +Vn5V8qdE9isy9COoR+jUKgF4z2rDN6ieZdIs5fq6M8EGRPbmz6UNp2YINIos8wIDAQABo0IwQDAP +BgNVHRMBAf8EBTADAQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUR9DnsSL/nSz12Vdgs7Gx +cJXvYXowDQYJKoZIhvcNAQELBQADggIBAIZpsU0v6Z9PIpNojuQhmaPORVMbc0RTAIFhzTHjCLqB +KCh6krm2qMhDnscTJk3C2OVVnJJdUNjCK9v+5qiXz1I6JMNlZFxHMaNlNRPDk7n3+VGXu6TwYofF +1gbTl4MgqX67tiHCpQ2EAOHyJxCDut0DgdXdaMNmEMjRdrSzbymeAPnCKfWxkxlSaRosTKCL4BWa +MS/TiJVZbuXEs1DIFAhKm4sTg7GkcrI7djNB3NyqpgdvHSQSn8h2vS/ZjvQs7rfSOBAkNlEv41xd +gSGn2rtO/+YHqP65DSdsu3BaVXoT6fEqSWnHX4dXTEN5bTpl6TBcQe7rd6VzEojov32u5cSoHw2O +HG1QAk8mGEPej1WFsQs3BWDJVTkSBKEqz3EWnzZRSb9wO55nnPt7eck5HHisd5FUmrh1CoFSl+Nm +YWvtPjgelmFV4ZFUjO2MJB+ByRCac5krFk5yAD9UG/iNuovnFNa2RU9g7Jauwy8CTl2dlklyALKr +dVwPaFsdZcJfMw8eD/A7hvWwTruc9+olBdytoptLFwG+Qt81IR2tq670v64fG9PiO/yzcnMcmyiQ +iRM9HcEARwmWmjgb3bHPDcK0RPOWlc4yOo80nOAXx17Org3bhzjlP1v9mxnhMUF6cKojawHhRUzN +lM47ni3niAIi9G7oyOzWPPO5std3eqx7 +-----END CERTIFICATE----- diff --git a/packages/sysutils/systemd/package.mk 
b/packages/sysutils/systemd/package.mk index 9af3ea927a..8edc857b0c 100644 --- a/packages/sysutils/systemd/package.mk +++ b/packages/sysutils/systemd/package.mk @@ -202,7 +202,11 @@ post_makeinstall_target() { # tune logind.conf sed -e "s,^.*HandleLidSwitch=.*$,HandleLidSwitch=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf - sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf + if [ "${DISPLAYSERVER}" = "no" ]; then + sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=poweroff,g" -i ${INSTALL}/etc/systemd/logind.conf + else + sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf + fi if [ "${DISTRO}" = "Lakka" -a "${PROJECT}" = "RPi" ]; then sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=poweroff,g" -i $INSTALL/etc/systemd/logind.conf diff --git a/packages/tools/bcm2835-bootloader/files/update.sh b/packages/tools/bcm2835-bootloader/files/update.sh index 1d07b7fe93..500d2079aa 100755 --- a/packages/tools/bcm2835-bootloader/files/update.sh +++ b/packages/tools/bcm2835-bootloader/files/update.sh @@ -17,9 +17,11 @@ mount -o remount,rw $BOOT_ROOT # update bootloader files cp -p $SYSTEM_ROOT/usr/share/bootloader/LICENCE* $BOOT_ROOT -cp -p $SYSTEM_ROOT/usr/share/bootloader/bootcode.bin $BOOT_ROOT -cp -p $SYSTEM_ROOT/usr/share/bootloader/fixup.dat $BOOT_ROOT -cp -p $SYSTEM_ROOT/usr/share/bootloader/start.elf $BOOT_ROOT +for f in bootcode.bin fixup.dat start.elf ; do + if [ -f "${SYSTEM_ROOT}/usr/share/bootloader/$f" ]; then + cp -p "${SYSTEM_ROOT}/usr/share/bootloader/$f" "${BOOT_ROOT}" + fi +done rm -f $BOOT_ROOT/bcm283*.dtb # cleanup excess dtb's used by upstream kernels (ie. 
not LE) cp -p $SYSTEM_ROOT/usr/share/bootloader/*.dtb $BOOT_ROOT diff --git a/packages/tools/bcm2835-bootloader/package.mk b/packages/tools/bcm2835-bootloader/package.mk index b4d99f118b..b8dcb35443 100644 --- a/packages/tools/bcm2835-bootloader/package.mk +++ b/packages/tools/bcm2835-bootloader/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="bcm2835-bootloader" -PKG_VERSION="543692d23dff7075915bc9c7e34abb3fe28e1c46" -PKG_SHA256="718389513a36ce7073ee26666dfbdfeb259a7e88beaac603c3b7d1f5bde067a2" +PKG_VERSION="fdb9eafae4b83e553593937eae8e77b0193903c3" +PKG_SHA256="ce45b07afce3279f9d31fe12008c5250de4da5491bd9ced2de2f2ebb563aea80" PKG_ARCH="arm aarch64" PKG_LICENSE="nonfree" PKG_SITE="http://www.broadcom.com" @@ -16,14 +16,19 @@ PKG_TOOLCHAIN="manual" makeinstall_target() { mkdir -p ${INSTALL}/usr/share/bootloader cp -PRv LICENCE* ${INSTALL}/usr/share/bootloader - cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader - if [ "${DEVICE:0:4}" = "RPi4" ]; then - cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat - cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf - else - cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat - cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf - fi + case "${DEVICE}" in + RPi4) + cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat + cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf + ;; + RPi5) + ;; + *) + cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader + cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat + cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf + ;; + esac find_file_path bootloader/update.sh ${PKG_DIR}/files/update.sh && cp -PRv ${FOUND_PATH} ${INSTALL}/usr/share/bootloader find_file_path bootloader/canupdate.sh && cp -PRv ${FOUND_PATH} ${INSTALL}/usr/share/bootloader diff --git a/packages/tools/bcm2835-bootloader/release b/packages/tools/bcm2835-bootloader/release index 
7ec99fd3a2..efdb7322e7 100755 --- a/packages/tools/bcm2835-bootloader/release +++ b/packages/tools/bcm2835-bootloader/release @@ -6,9 +6,11 @@ mkdir -p $RELEASE_DIR/3rdparty/bootloader cp -PR $INSTALL/usr/share/bootloader/LICENCE* $RELEASE_DIR/3rdparty/bootloader/ - cp -PR $INSTALL/usr/share/bootloader/bootcode.bin $RELEASE_DIR/3rdparty/bootloader/ - cp -PR $INSTALL/usr/share/bootloader/fixup.dat $RELEASE_DIR/3rdparty/bootloader/ - cp -PR $INSTALL/usr/share/bootloader/start.elf $RELEASE_DIR/3rdparty/bootloader/ + for f in bootcode.bin fixup.dat start.elf ; do + if [ -f "${INSTALL}/usr/share/bootloader/$f" ]; then + cp -PR "${INSTALL}/usr/share/bootloader/$f" "${RELEASE_DIR}/3rdparty/bootloader/" + fi + done cp -PR $INSTALL/usr/share/bootloader/*.dtb $RELEASE_DIR/3rdparty/bootloader/ cp -PR $INSTALL/usr/share/bootloader/overlays $RELEASE_DIR/3rdparty/bootloader/ diff --git a/packages/tools/rpi-eeprom/package.mk b/packages/tools/rpi-eeprom/package.mk index 3be9bc3156..42f320cec5 100644 --- a/packages/tools/rpi-eeprom/package.mk +++ b/packages/tools/rpi-eeprom/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv) PKG_NAME="rpi-eeprom" -PKG_VERSION="75d3a760469130cb537e5d8d504f892336abd62b" -PKG_SHA256="a71573a80149b1c2c4b6d5ec1527ef011611c6883a0cf06c02961fe518384307" +PKG_VERSION="6b14e84a2fb2e1f7220a404f65e7e0985f07c9e5" +PKG_SHA256="3907711bb2ff78a0e9120709b72b04d6d010f93f79d525af0454d3d27a772aca" PKG_LICENSE="BSD-3/custom" PKG_SITE="https://github.com/raspberrypi/rpi-eeprom" PKG_URL="https://github.com/raspberrypi/rpi-eeprom/archive/${PKG_VERSION}.tar.gz" @@ -12,14 +12,20 @@ PKG_LONGDESC="rpi-eeprom: firmware, config and scripts to update RPi4 SPI bootlo PKG_TOOLCHAIN="manual" makeinstall_target() { - DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader + + if [ "${DEVICE}" = "RPi4" ]; then + _variant="2711" + else + _variant="2712" + fi + + 
DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader-${_variant} mkdir -p ${DESTDIR} - _dirs="critical stable" - [ "${LIBREELEC_VERSION}" = "devel" ] && _dirs+=" beta" + _dirs="default latest" for _maindir in ${_dirs}; do - for _dir in ${PKG_BUILD}/firmware/${_maindir} ${PKG_BUILD}/firmware/{_maindir}-*; do + for _dir in ${PKG_BUILD}/firmware-${_variant}/${_maindir} ${PKG_BUILD}/firmware-${_variant}/${_maindir}-*; do [ -d "${_dir}" ] || continue _basedir="$(basename "${_dir}")" @@ -31,14 +37,17 @@ makeinstall_target() { PKG_FW_FILE="$(ls -1 /${_dir}/pieeprom-* 2>/dev/null | tail -1)" [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir} - # VIA USB3 - PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)" - [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir} + if [ "${DEVICE}" = "RPi4" ]; then + # VIA USB3 + PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)" + [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir} + fi done done - # also copy default and latest symlinks - cp -Prv ${PKG_BUILD}/firmware/{default,latest} ${DESTDIR} + # also create legacy naming symlinks + ln -s default ${DESTDIR}/critical + ln -s latest ${DESTDIR}/stable mkdir -p ${INSTALL}/usr/bin cp -PRv ${PKG_DIR}/source/rpi-eeprom-update ${INSTALL}/usr/bin diff --git a/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat b/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat index 4349b287e4..2fbd2899ab 100644 --- a/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat +++ b/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat @@ -2,3 +2,4 @@ BCM43430A1.def BCM43430A1.vim *.txt +brcmfmac43456-sdio.bin diff --git a/projects/Allwinner/linux/linux.aarch64.conf b/projects/Allwinner/linux/linux.aarch64.conf index 1323a014f5..5994eedeb3 100644 --- a/projects/Allwinner/linux/linux.aarch64.conf +++ b/projects/Allwinner/linux/linux.aarch64.conf @@ -1,6 +1,6 @@ # # Automatically 
generated file; DO NOT EDIT. -# Linux/arm64 6.1.0-rc6 Kernel Configuration +# Linux/arm64 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -338,6 +338,7 @@ CONFIG_ARCH_SUNXI=y # # ARM errata workarounds via the alternatives framework # +# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y CONFIG_ARM64_ERRATUM_826319=y CONFIG_ARM64_ERRATUM_827319=y @@ -364,6 +365,7 @@ CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y # CONFIG_ARM64_ERRATUM_2054223 is not set # CONFIG_ARM64_ERRATUM_2067961 is not set # CONFIG_ARM64_ERRATUM_2441009 is not set +# CONFIG_ARM64_ERRATUM_2966298 is not set # CONFIG_CAVIUM_ERRATUM_22375 is not set # CONFIG_CAVIUM_ERRATUM_23154 is not set # CONFIG_CAVIUM_ERRATUM_27456 is not set @@ -853,6 +855,7 @@ CONFIG_SECRETMEM=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1709,7 +1712,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set CONFIG_SCSI_CONSTANTS=y @@ -3660,10 +3663,7 @@ CONFIG_MEDIA_ATTACH=y # IR I2C driver auto-selected by 'Autoselect ancillary drivers' # CONFIG_VIDEO_IR_I2C=y - -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -3730,7 +3730,6 @@ CONFIG_VIDEO_OV7640=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -6460,7 +6459,7 @@ CONFIG_CIFS_DEBUG=y # CONFIG_CIFS_SWN_UPCALL is not set # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git 
a/projects/Allwinner/linux/linux.arm.conf b/projects/Allwinner/linux/linux.arm.conf index 76fec8c764..863d613a86 100644 --- a/projects/Allwinner/linux/linux.arm.conf +++ b/projects/Allwinner/linux/linux.arm.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 6.1.0-rc6 Kernel Configuration +# Linux/arm 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="armv7ve-libreelec-linux-gnueabihf-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -611,6 +611,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y CONFIG_ARCH_HAS_FORTIFY_SOURCE=y CONFIG_ARCH_HAS_KEEPINITRD=y CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y CONFIG_ARCH_32BIT_OFF_T=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y @@ -821,6 +822,7 @@ CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1574,7 +1576,7 @@ CONFIG_SCSI_PROC_FS=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set CONFIG_SCSI_CONSTANTS=y @@ -3398,9 +3400,7 @@ CONFIG_MEDIA_ATTACH=y # CONFIG_VIDEO_IR_I2C=y -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -3467,7 +3467,6 @@ CONFIG_VIDEO_OV7640=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -6060,7 +6059,7 @@ CONFIG_CIFS_DEBUG=y # CONFIG_CIFS_SWN_UPCALL is not set # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git a/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch 
b/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch new file mode 100644 index 0000000000..218fdae116 --- /dev/null +++ b/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch @@ -0,0 +1,61 @@ +Subject: [PATCH] drm: Call drm_atomic_helper_shutdown() at shutdown time for misc drivers +From: Douglas Anderson +Date: Fri, 01 Sep 2023 16:39:53 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +Based on grepping through the source code these drivers appear to be +missing a call to drm_atomic_helper_shutdown() at system shutdown +time. Among other things, this means that if a panel is in use that it +won't be cleanly powered off at system shutdown time. + +The fact that we should call drm_atomic_helper_shutdown() in the case +of OS shutdown/restart comes straight out of the kernel doc "driver +instance overview" in drm_drv.c. + +All of the drivers in this patch were fairly straightforward to fix +since they already had a call to drm_atomic_helper_shutdown() at +remove/unbind time but were just lacking one at system shutdown. The +only hitch is that some of these drivers use the component model to +register/unregister their DRM devices. The shutdown callback is part +of the original device. The typical solution here, based on how other +DRM drivers do this, is to keep track of whether the device is bound +based on drvdata. In most cases the drvdata is the drm_device, so we +can just make sure it is NULL when the device is not bound. In some +drivers, this required minor code changes. To make things simpler, +drm_atomic_helper_shutdown() has been modified to consider a NULL +drm_device as a noop in the patch ("drm/atomic-helper: +drm_atomic_helper_shutdown(NULL) should be a noop"). 
+ +Suggested-by: Maxime Ripard +Signed-off-by: Douglas Anderson +Acked-by: Maxime Ripard +Link: https://lore.kernel.org/r/20230901163944.RFT.2.I9115e5d094a43e687978b0699cc1fe9f2a3452ea@changeid +--- + +diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c +index 6a8dfc022d3c..35d7a7ffd208 100644 +--- a/drivers/gpu/drm/sun4i/sun4i_drv.c ++++ b/drivers/gpu/drm/sun4i/sun4i_drv.c +@@ -413,6 +413,11 @@ static void sun4i_drv_remove(struct platform_device *pdev) + component_master_del(&pdev->dev, &sun4i_drv_master_ops); + } + ++static void sun4i_drv_shutdown(struct platform_device *pdev) ++{ ++ drm_atomic_helper_shutdown(platform_get_drvdata(pdev)); ++} ++ + static const struct of_device_id sun4i_drv_of_table[] = { + { .compatible = "allwinner,sun4i-a10-display-engine" }, + { .compatible = "allwinner,sun5i-a10s-display-engine" }, +@@ -437,6 +442,7 @@ MODULE_DEVICE_TABLE(of, sun4i_drv_of_table); + static struct platform_driver sun4i_drv_platform_driver = { + .probe = sun4i_drv_probe, + .remove = sun4i_drv_remove, ++ .shutdown = sun4i_drv_shutdown, + .driver = { + .name = "sun4i-drm", + .of_match_table = sun4i_drv_of_table, + diff --git a/projects/Amlogic/linux/linux.aarch64.conf b/projects/Amlogic/linux/linux.aarch64.conf index 3122a3c071..75c4284a5d 100644 --- a/projects/Amlogic/linux/linux.aarch64.conf +++ b/projects/Amlogic/linux/linux.aarch64.conf @@ -1770,7 +1770,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set diff --git a/projects/Generic/linux/linux.x86_64.conf b/projects/Generic/linux/linux.x86_64.conf index 73a6b53fa5..3e55a1e821 100644 --- a/projects/Generic/linux/linux.x86_64.conf +++ b/projects/Generic/linux/linux.x86_64.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 6.1.30 Kernel Configuration +# Linux/x86 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="x86_64-libreelec-linux-gnu-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -472,7 +472,9 @@ CONFIG_RETHUNK=y CONFIG_CPU_UNRET_ENTRY=y CONFIG_CPU_IBPB_ENTRY=y CONFIG_CPU_IBRS_ENTRY=y +CONFIG_CPU_SRSO=y # CONFIG_SLS is not set +# CONFIG_GDS_FORCE_MITIGATION is not set CONFIG_ARCH_HAS_ADD_PAGES=y CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y @@ -666,6 +668,7 @@ CONFIG_GENERIC_SMP_IDLE_THREAD=y CONFIG_ARCH_HAS_FORTIFY_SOURCE=y CONFIG_ARCH_HAS_SET_MEMORY=y CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y CONFIG_ARCH_WANTS_NO_INSTR=y @@ -969,6 +972,7 @@ CONFIG_SECRETMEM=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1853,7 +1857,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -4066,9 +4070,7 @@ CONFIG_MEDIA_ATTACH=y # CONFIG_VIDEO_IR_I2C=m -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -4130,7 +4132,6 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -6402,7 +6403,7 @@ CONFIG_CIFS_DEBUG=y CONFIG_CIFS_FSCACHE=y # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git a/projects/NXP/devices/iMX6/linux/linux.arm.conf b/projects/NXP/devices/iMX6/linux/linux.arm.conf index 48ba320fd5..29c5df0dde 100644 --- 
a/projects/NXP/devices/iMX6/linux/linux.arm.conf +++ b/projects/NXP/devices/iMX6/linux/linux.arm.conf @@ -1818,7 +1818,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m # CONFIG_BLK_DEV_BSG is not set # CONFIG_CHR_DEV_SCH is not set CONFIG_SCSI_CONSTANTS=y diff --git a/projects/NXP/devices/iMX8/linux/linux.aarch64.conf b/projects/NXP/devices/iMX8/linux/linux.aarch64.conf index 49ff57d6da..c0ee96e16a 100644 --- a/projects/NXP/devices/iMX8/linux/linux.aarch64.conf +++ b/projects/NXP/devices/iMX8/linux/linux.aarch64.conf @@ -1775,7 +1775,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set diff --git a/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf b/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf index 3df420b438..4e3f207144 100644 --- a/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf +++ b/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf @@ -1933,7 +1933,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set diff --git a/projects/RPi/devices/RPi/linux/linux.arm.conf b/projects/RPi/devices/RPi/linux/linux.arm.conf index 76ab802670..8f7f0f32e6 100644 --- a/projects/RPi/devices/RPi/linux/linux.arm.conf +++ b/projects/RPi/devices/RPi/linux/linux.arm.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/arm 6.1.38 Kernel Configuration +# Linux/arm 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]" CONFIG_CC_IS_GCC=y @@ -496,6 +496,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y CONFIG_ARCH_HAS_FORTIFY_SOURCE=y CONFIG_ARCH_HAS_KEEPINITRD=y CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y CONFIG_ARCH_32BIT_OFF_T=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y @@ -1065,8 +1066,6 @@ CONFIG_NET_SCH_FQ_CODEL=y # CONFIG_NET_CLS_ROUTE4 is not set # CONFIG_NET_CLS_FW is not set # CONFIG_NET_CLS_U32 is not set -# CONFIG_NET_CLS_RSVP is not set -# CONFIG_NET_CLS_RSVP6 is not set # CONFIG_NET_CLS_FLOW is not set # CONFIG_NET_CLS_CGROUP is not set # CONFIG_NET_CLS_BPF is not set @@ -1396,7 +1395,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -1943,6 +1942,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m # CONFIG_INPUT_DRV260X_HAPTICS is not set # CONFIG_INPUT_DRV2665_HAPTICS is not set # CONFIG_INPUT_DRV2667_HAPTICS is not set +# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set CONFIG_RMI4_CORE=y # CONFIG_RMI4_I2C is not set # CONFIG_RMI4_SPI is not set @@ -1981,7 +1981,6 @@ CONFIG_SERIO_SERPORT=y CONFIG_BRCM_CHAR_DRIVERS=y CONFIG_BCM2708_VCMEM=y CONFIG_BCM_VCIO=y -CONFIG_BCM2835_DEVGPIOMEM=m CONFIG_BCM2835_SMI_DEV=m # CONFIG_RPIVID_MEM is not set CONFIG_TTY=y @@ -2066,6 +2065,7 @@ CONFIG_DEVMEM=y # CONFIG_XILLYUSB is not set CONFIG_RANDOM_TRUST_CPU=y CONFIG_RANDOM_TRUST_BOOTLOADER=y +CONFIG_RASPBERRYPI_GPIOMEM=y # end of Character devices # @@ -2214,6 +2214,8 @@ CONFIG_GENERIC_PINCONF=y # CONFIG_PINCTRL_SINGLE is not set # CONFIG_PINCTRL_STMFX is not set # CONFIG_PINCTRL_SX150X is not set +# CONFIG_PINCTRL_RP1 is not set +# 
CONFIG_PINCTRL_BCM2712 is not set CONFIG_PINCTRL_BCM2835=y # @@ -2238,6 +2240,7 @@ CONFIG_GPIO_CDEV_V1=y # CONFIG_GPIO_ALTERA is not set CONFIG_GPIO_RASPBERRYPI_EXP=y CONFIG_GPIO_BCM_VIRT=y +# CONFIG_GPIO_BRCMSTB is not set # CONFIG_GPIO_CADENCE is not set # CONFIG_GPIO_DWAPB is not set # CONFIG_GPIO_FTGPIO010 is not set @@ -2881,11 +2884,14 @@ CONFIG_DVB_CORE=m # Video4Linux options # CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m CONFIG_V4L2_MEM2MEM_DEV=m # CONFIG_V4L2_FLASH_LED_CLASS is not set +CONFIG_V4L2_FWNODE=m +CONFIG_V4L2_ASYNC=m # end of Video4Linux options # @@ -3074,6 +3080,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y # Qualcomm media platform drivers # +# +# Raspberry Pi media platform drivers +# +# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set +# CONFIG_VIDEO_RP1_CFE is not set + # # Renesas media platform drivers # @@ -3145,10 +3157,7 @@ CONFIG_MEDIA_ATTACH=y # IR I2C driver auto-selected by 'Autoselect ancillary drivers' # CONFIG_VIDEO_IR_I2C=m - -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_ARDUCAM_64MP is not set # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set @@ -3167,6 +3176,7 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_IMX335 is not set # CONFIG_VIDEO_IMX355 is not set # CONFIG_VIDEO_IMX412 is not set +# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_IMX519 is not set # CONFIG_VIDEO_IMX708 is not set # CONFIG_VIDEO_MT9M001 is not set @@ -3221,7 +3231,6 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -3258,7 +3267,6 @@ CONFIG_VIDEO_MSP3400=m # CONFIG_VIDEO_TLV320AIC23B is not set # CONFIG_VIDEO_TVAUDIO is not set # CONFIG_VIDEO_UDA1342 is not set -# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_VP27SMPX is not set # CONFIG_VIDEO_WM8739 is not set CONFIG_VIDEO_WM8775=m 
@@ -3749,6 +3757,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y # CONFIG_DRM_V3D is not set CONFIG_DRM_VC4=y CONFIG_DRM_VC4_HDMI_CEC=y +# CONFIG_DRM_RP1_DSI is not set +# CONFIG_DRM_RP1_DPI is not set +# CONFIG_DRM_RP1_VEC is not set # CONFIG_DRM_ETNAVIV is not set # CONFIG_DRM_LOGICVC is not set # CONFIG_DRM_ARCPGU is not set @@ -5024,6 +5035,7 @@ CONFIG_PWM_BCM2835=m # CONFIG_PWM_FSL_FTM is not set # CONFIG_PWM_PCA9685 is not set CONFIG_PWM_RASPBERRYPI_POE=m +# CONFIG_PWM_RP1 is not set # CONFIG_PWM_XILINX is not set # @@ -5037,6 +5049,7 @@ CONFIG_BRCMSTB_L2_IRQ=y # CONFIG_IPACK_BUS is not set CONFIG_RESET_CONTROLLER=y +# CONFIG_RESET_BRCMSTB is not set # CONFIG_RESET_RASPBERRYPI is not set # CONFIG_RESET_SIMPLE is not set # CONFIG_RESET_TI_SYSCON is not set @@ -5052,6 +5065,7 @@ CONFIG_RESET_CONTROLLER=y # PHY drivers for Broadcom platforms # # CONFIG_BCM_KONA_USB2_PHY is not set +# CONFIG_PHY_BRCM_USB is not set # end of PHY drivers for Broadcom platforms # CONFIG_PHY_CADENCE_TORRENT is not set @@ -5169,7 +5183,7 @@ CONFIG_DNOTIFY=y CONFIG_INOTIFY_USER=y CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set -CONFIG_AUTOFS4_FS=y +# CONFIG_AUTOFS4_FS is not set CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m # CONFIG_CUSE is not set diff --git a/projects/RPi/devices/RPi2/linux/linux.arm.conf b/projects/RPi/devices/RPi2/linux/linux.arm.conf index 0d85078021..9ac3225cfa 100644 --- a/projects/RPi/devices/RPi2/linux/linux.arm.conf +++ b/projects/RPi/devices/RPi2/linux/linux.arm.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/arm 6.1.38 Kernel Configuration +# Linux/arm 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]" CONFIG_CC_IS_GCC=y @@ -608,6 +608,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y CONFIG_ARCH_HAS_FORTIFY_SOURCE=y CONFIG_ARCH_HAS_KEEPINITRD=y CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y CONFIG_ARCH_32BIT_OFF_T=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y @@ -1247,8 +1248,6 @@ CONFIG_NET_CLS=y # CONFIG_NET_CLS_ROUTE4 is not set # CONFIG_NET_CLS_FW is not set # CONFIG_NET_CLS_U32 is not set -# CONFIG_NET_CLS_RSVP is not set -# CONFIG_NET_CLS_RSVP6 is not set # CONFIG_NET_CLS_FLOW is not set CONFIG_NET_CLS_CGROUP=m # CONFIG_NET_CLS_BPF is not set @@ -1588,7 +1587,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -2168,6 +2167,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m # CONFIG_INPUT_DRV260X_HAPTICS is not set # CONFIG_INPUT_DRV2665_HAPTICS is not set # CONFIG_INPUT_DRV2667_HAPTICS is not set +# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set CONFIG_RMI4_CORE=y # CONFIG_RMI4_I2C is not set # CONFIG_RMI4_SPI is not set @@ -2206,7 +2206,6 @@ CONFIG_SERIO_SERPORT=y CONFIG_BRCM_CHAR_DRIVERS=y CONFIG_BCM2708_VCMEM=y CONFIG_BCM_VCIO=y -CONFIG_BCM2835_DEVGPIOMEM=m CONFIG_BCM2835_SMI_DEV=m # CONFIG_RPIVID_MEM is not set CONFIG_TTY=y @@ -2291,6 +2290,7 @@ CONFIG_DEVMEM=y # CONFIG_XILLYUSB is not set CONFIG_RANDOM_TRUST_CPU=y CONFIG_RANDOM_TRUST_BOOTLOADER=y +CONFIG_RASPBERRYPI_GPIOMEM=y # end of Character devices # @@ -2439,6 +2439,8 @@ CONFIG_GENERIC_PINCONF=y # CONFIG_PINCTRL_SINGLE is not set # CONFIG_PINCTRL_STMFX is not set # CONFIG_PINCTRL_SX150X is not set +# CONFIG_PINCTRL_RP1 is not set +# CONFIG_PINCTRL_BCM2712 is 
not set CONFIG_PINCTRL_BCM2835=y # @@ -2463,6 +2465,7 @@ CONFIG_GPIO_CDEV_V1=y # CONFIG_GPIO_ALTERA is not set CONFIG_GPIO_RASPBERRYPI_EXP=y CONFIG_GPIO_BCM_VIRT=y +# CONFIG_GPIO_BRCMSTB is not set # CONFIG_GPIO_CADENCE is not set # CONFIG_GPIO_DWAPB is not set # CONFIG_GPIO_FTGPIO010 is not set @@ -3113,11 +3116,14 @@ CONFIG_DVB_CORE=m # Video4Linux options # CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m CONFIG_V4L2_MEM2MEM_DEV=m # CONFIG_V4L2_FLASH_LED_CLASS is not set +CONFIG_V4L2_FWNODE=m +CONFIG_V4L2_ASYNC=m # end of Video4Linux options # @@ -3306,6 +3312,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y # Qualcomm media platform drivers # +# +# Raspberry Pi media platform drivers +# +# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set +# CONFIG_VIDEO_RP1_CFE is not set + # # Renesas media platform drivers # @@ -3377,10 +3389,7 @@ CONFIG_MEDIA_ATTACH=y # IR I2C driver auto-selected by 'Autoselect ancillary drivers' # CONFIG_VIDEO_IR_I2C=m - -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_ARDUCAM_64MP is not set # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set @@ -3399,6 +3408,7 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_IMX335 is not set # CONFIG_VIDEO_IMX355 is not set # CONFIG_VIDEO_IMX412 is not set +# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_IMX519 is not set # CONFIG_VIDEO_IMX708 is not set # CONFIG_VIDEO_MT9M001 is not set @@ -3453,7 +3463,6 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -3490,7 +3499,6 @@ CONFIG_VIDEO_MSP3400=m # CONFIG_VIDEO_TLV320AIC23B is not set # CONFIG_VIDEO_TVAUDIO is not set # CONFIG_VIDEO_UDA1342 is not set -# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_VP27SMPX is not set # CONFIG_VIDEO_WM8739 is not set CONFIG_VIDEO_WM8775=m @@ -3982,6 +3990,9 @@ 
CONFIG_DRM_TOSHIBA_TC358762=y # CONFIG_DRM_V3D is not set CONFIG_DRM_VC4=y CONFIG_DRM_VC4_HDMI_CEC=y +# CONFIG_DRM_RP1_DSI is not set +# CONFIG_DRM_RP1_DPI is not set +# CONFIG_DRM_RP1_VEC is not set # CONFIG_DRM_ETNAVIV is not set # CONFIG_DRM_LOGICVC is not set # CONFIG_DRM_ARCPGU is not set @@ -5260,6 +5271,7 @@ CONFIG_PWM_BCM2835=m # CONFIG_PWM_FSL_FTM is not set # CONFIG_PWM_PCA9685 is not set CONFIG_PWM_RASPBERRYPI_POE=m +# CONFIG_PWM_RP1 is not set # CONFIG_PWM_XILINX is not set # @@ -5269,12 +5281,14 @@ CONFIG_IRQCHIP=y CONFIG_ARM_GIC=y CONFIG_ARM_GIC_MAX_NR=1 # CONFIG_AL_FIC is not set +# CONFIG_BCM2712_MIP is not set CONFIG_BRCMSTB_L2_IRQ=y # CONFIG_XILINX_INTC is not set # end of IRQ chip support # CONFIG_IPACK_BUS is not set CONFIG_RESET_CONTROLLER=y +# CONFIG_RESET_BRCMSTB is not set # CONFIG_RESET_RASPBERRYPI is not set # CONFIG_RESET_SIMPLE is not set # CONFIG_RESET_TI_SYSCON is not set @@ -5290,6 +5304,7 @@ CONFIG_RESET_CONTROLLER=y # PHY drivers for Broadcom platforms # # CONFIG_BCM_KONA_USB2_PHY is not set +# CONFIG_PHY_BRCM_USB is not set # end of PHY drivers for Broadcom platforms # CONFIG_PHY_CADENCE_TORRENT is not set @@ -5411,7 +5426,7 @@ CONFIG_DNOTIFY=y CONFIG_INOTIFY_USER=y CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set -CONFIG_AUTOFS4_FS=y +# CONFIG_AUTOFS4_FS is not set CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m # CONFIG_CUSE is not set diff --git a/projects/RPi/devices/RPi4/linux/linux.aarch64.conf b/projects/RPi/devices/RPi4/linux/linux.aarch64.conf index 17bba563ef..6419a4e477 100644 --- a/projects/RPi/devices/RPi4/linux/linux.aarch64.conf +++ b/projects/RPi/devices/RPi4/linux/linux.aarch64.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/arm64 6.1.38 Kernel Configuration +# Linux/arm64 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="aarch64-linux-gnu-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]" CONFIG_CC_IS_GCC=y @@ -327,6 +327,7 @@ CONFIG_ARCH_BCM2835=y # # ARM errata workarounds via the alternatives framework # +CONFIG_AMPERE_ERRATUM_AC03_CPU_38=y CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y CONFIG_ARM64_ERRATUM_826319=y CONFIG_ARM64_ERRATUM_827319=y @@ -357,6 +358,7 @@ CONFIG_ARM64_ERRATUM_2054223=y CONFIG_ARM64_ERRATUM_2067961=y CONFIG_ARM64_ERRATUM_2441009=y CONFIG_ARM64_ERRATUM_2457168=y +CONFIG_ARM64_ERRATUM_2966298=y CONFIG_CAVIUM_ERRATUM_22375=y CONFIG_CAVIUM_ERRATUM_23154=y CONFIG_CAVIUM_ERRATUM_27456=y @@ -1329,8 +1331,6 @@ CONFIG_NET_CLS=y # CONFIG_NET_CLS_ROUTE4 is not set # CONFIG_NET_CLS_FW is not set # CONFIG_NET_CLS_U32 is not set -# CONFIG_NET_CLS_RSVP is not set -# CONFIG_NET_CLS_RSVP6 is not set # CONFIG_NET_CLS_FLOW is not set CONFIG_NET_CLS_CGROUP=m # CONFIG_NET_CLS_BPF is not set @@ -1798,7 +1798,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -2693,6 +2693,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m # CONFIG_INPUT_DRV260X_HAPTICS is not set # CONFIG_INPUT_DRV2665_HAPTICS is not set # CONFIG_INPUT_DRV2667_HAPTICS is not set +# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set CONFIG_RMI4_CORE=y # CONFIG_RMI4_I2C is not set # CONFIG_RMI4_SPI is not set @@ -2732,7 +2733,6 @@ CONFIG_SERIO_SERPORT=y CONFIG_BRCM_CHAR_DRIVERS=y CONFIG_BCM2708_VCMEM=y CONFIG_BCM_VCIO=y -CONFIG_BCM2835_DEVGPIOMEM=m CONFIG_BCM2835_SMI_DEV=m # CONFIG_RPIVID_MEM is not set CONFIG_TTY=y @@ -2826,6 +2826,7 @@ CONFIG_DEVPORT=y # CONFIG_XILLYUSB is not set CONFIG_RANDOM_TRUST_CPU=y CONFIG_RANDOM_TRUST_BOOTLOADER=y +CONFIG_RASPBERRYPI_GPIOMEM=y # end of Character 
devices # @@ -2999,6 +3000,8 @@ CONFIG_GENERIC_PINCONF=y # CONFIG_PINCTRL_SINGLE is not set # CONFIG_PINCTRL_STMFX is not set # CONFIG_PINCTRL_SX150X is not set +# CONFIG_PINCTRL_RP1 is not set +# CONFIG_PINCTRL_BCM2712 is not set CONFIG_PINCTRL_BCM2835=y # @@ -3022,6 +3025,7 @@ CONFIG_GPIO_CDEV_V1=y # CONFIG_GPIO_ALTERA is not set CONFIG_GPIO_RASPBERRYPI_EXP=y CONFIG_GPIO_BCM_VIRT=y +# CONFIG_GPIO_BRCMSTB is not set # CONFIG_GPIO_CADENCE is not set # CONFIG_GPIO_DWAPB is not set # CONFIG_GPIO_EXAR is not set @@ -3561,6 +3565,7 @@ CONFIG_MFD_WM5102=y # CONFIG_MFD_QCOM_PM8008 is not set # CONFIG_RAVE_SP_CORE is not set # CONFIG_MFD_INTEL_M10_BMC is not set +# CONFIG_MFD_RP1 is not set # CONFIG_MFD_RSMU_I2C is not set # CONFIG_MFD_RSMU_SPI is not set # end of Multifunction device drivers @@ -3703,11 +3708,14 @@ CONFIG_DVB_CORE=m # Video4Linux options # CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m CONFIG_V4L2_MEM2MEM_DEV=m # CONFIG_V4L2_FLASH_LED_CLASS is not set +CONFIG_V4L2_FWNODE=m +CONFIG_V4L2_ASYNC=m # end of Video4Linux options # @@ -3900,6 +3908,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y # Qualcomm media platform drivers # +# +# Raspberry Pi media platform drivers +# +# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set +# CONFIG_VIDEO_RP1_CFE is not set + # # Renesas media platform drivers # @@ -3971,10 +3985,7 @@ CONFIG_MEDIA_ATTACH=y # IR I2C driver auto-selected by 'Autoselect ancillary drivers' # CONFIG_VIDEO_IR_I2C=m - -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_ARDUCAM_64MP is not set # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set @@ -3993,6 +4004,7 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_IMX335 is not set # CONFIG_VIDEO_IMX355 is not set # CONFIG_VIDEO_IMX412 is not set +# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_IMX519 is not set # CONFIG_VIDEO_IMX708 is not set # CONFIG_VIDEO_MT9M001 is 
not set @@ -4047,7 +4059,6 @@ CONFIG_VIDEO_IR_I2C=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -4084,7 +4095,6 @@ CONFIG_VIDEO_MSP3400=m # CONFIG_VIDEO_TLV320AIC23B is not set # CONFIG_VIDEO_TVAUDIO is not set # CONFIG_VIDEO_UDA1342 is not set -# CONFIG_VIDEO_IMX477 is not set # CONFIG_VIDEO_VP27SMPX is not set # CONFIG_VIDEO_WM8739 is not set CONFIG_VIDEO_WM8775=m @@ -4577,6 +4587,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y CONFIG_DRM_V3D=y CONFIG_DRM_VC4=y CONFIG_DRM_VC4_HDMI_CEC=y +# CONFIG_DRM_RP1_DSI is not set +# CONFIG_DRM_RP1_DPI is not set +# CONFIG_DRM_RP1_VEC is not set # CONFIG_DRM_ETNAVIV is not set # CONFIG_DRM_HISI_HIBMC is not set # CONFIG_DRM_HISI_KIRIN is not set @@ -5837,6 +5850,7 @@ CONFIG_COMMON_CLK=y # CONFIG_LMK04832 is not set # CONFIG_COMMON_CLK_MAX9485 is not set +# CONFIG_COMMON_CLK_RP1 is not set CONFIG_COMMON_CLK_HIFIBERRY_DACPLUSHD=m CONFIG_COMMON_CLK_HIFIBERRY_DACPRO=m # CONFIG_COMMON_CLK_SI5341 is not set @@ -5980,6 +5994,7 @@ CONFIG_PWM_BCM2835=m # CONFIG_PWM_FSL_FTM is not set # CONFIG_PWM_PCA9685 is not set CONFIG_PWM_RASPBERRYPI_POE=m +# CONFIG_PWM_RP1 is not set # CONFIG_PWM_XILINX is not set # @@ -5993,6 +6008,7 @@ CONFIG_ARM_GIC_V3=y CONFIG_ARM_GIC_V3_ITS=y CONFIG_ARM_GIC_V3_ITS_PCI=y # CONFIG_AL_FIC is not set +# CONFIG_BCM2712_MIP is not set CONFIG_BRCMSTB_L2_IRQ=y # CONFIG_XILINX_INTC is not set CONFIG_PARTITION_PERCPU=y @@ -6000,6 +6016,7 @@ CONFIG_PARTITION_PERCPU=y # CONFIG_IPACK_BUS is not set CONFIG_RESET_CONTROLLER=y +# CONFIG_RESET_BRCMSTB is not set CONFIG_RESET_RASPBERRYPI=y CONFIG_RESET_SIMPLE=y # CONFIG_RESET_TI_SYSCON is not set @@ -6016,6 +6033,7 @@ CONFIG_RESET_SIMPLE=y # PHY drivers for Broadcom platforms # # CONFIG_BCM_KONA_USB2_PHY is not set +# CONFIG_PHY_BRCM_USB is not set # end of PHY drivers for Broadcom platforms # CONFIG_PHY_CADENCE_TORRENT is not set @@ -6145,7 +6163,7 @@ CONFIG_DNOTIFY=y 
CONFIG_INOTIFY_USER=y CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set -CONFIG_AUTOFS4_FS=y +# CONFIG_AUTOFS4_FS is not set CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m # CONFIG_CUSE is not set diff --git a/projects/RPi/devices/RPi5/config/config.txt b/projects/RPi/devices/RPi5/config/config.txt new file mode 100644 index 0000000000..e678c92b83 --- /dev/null +++ b/projects/RPi/devices/RPi5/config/config.txt @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2009-2014 Stephan Raue (stephan@openelec.tv) +# Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) +################################################################################ +# Bootloader configuration +# config.txt version v1 (do not remove or change this line!) +################################################################################ +# For more options and information see +# http://rpf.io/configtxt +################################################################################ + +# Don't send initial active source message. +# Avoids bringing CEC (enabled TV) out of standby and channel switch when +# rebooting. +hdmi_ignore_cec_init=1 + +[all] +################################################################################ +# Use distroconfig-composite.txt instead of distroconfig.txt to enable +# composite video output. 
+# The composite video mode needs to be configured in cmdline.txt: +# For PAL add: video=Composite-1:720x576@50ie +# For NTSC add: video=Composite-1:720x480@60ie +################################################################################ +include distroconfig.txt +#include distroconfig-composite.txt + +# uncomment to enable infrared remote receiver connected to GPIO 18 +#dtoverlay=gpio-ir,gpio_pin=18 + diff --git a/projects/RPi/devices/RPi5/config/distroconfig-composite.txt b/projects/RPi/devices/RPi5/config/distroconfig-composite.txt new file mode 100644 index 0000000000..26625be562 --- /dev/null +++ b/projects/RPi/devices/RPi5/config/distroconfig-composite.txt @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv) + +# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING! +arm_boost=1 +arm_64bit=1 +kernel=kernel.img +display_auto_detect=1 +enable_tvout=1 +dtoverlay=vc4-kms-v3d,cma-512,composite=1 +dtoverlay= +disable_overscan=1 +disable_fw_kms_setup=1 +max_framebuffers=0 diff --git a/projects/RPi/devices/RPi5/config/distroconfig.txt b/projects/RPi/devices/RPi5/config/distroconfig.txt new file mode 100644 index 0000000000..10df6ca5e3 --- /dev/null +++ b/projects/RPi/devices/RPi5/config/distroconfig.txt @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv) + +# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING! +arm_boost=1 +arm_64bit=1 +kernel=kernel.img +display_auto_detect=1 +dtoverlay=vc4-kms-v3d,cma-512 +dtoverlay= +disable_overscan=1 +disable_fw_kms_setup=1 +max_framebuffers=0 diff --git a/projects/RPi/devices/RPi5/kodi/appliance.xml b/projects/RPi/devices/RPi5/kodi/appliance.xml new file mode 100644 index 0000000000..60b5e99502 --- /dev/null +++ b/projects/RPi/devices/RPi5/kodi/appliance.xml @@ -0,0 +1,14 @@ + + + +

+ + + + ALSA:hdmi:CARD=vc4hdmi0,DEV=0 + + + +
+ + diff --git a/projects/RPi/devices/RPi5/linux/linux.aarch64.conf b/projects/RPi/devices/RPi5/linux/linux.aarch64.conf new file mode 100644 index 0000000000..74562c1641 --- /dev/null +++ b/projects/RPi/devices/RPi5/linux/linux.aarch64.conf @@ -0,0 +1,7144 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/arm64 6.1.57 Kernel Configuration +# +CONFIG_CC_VERSION_TEXT="aarch64-linux-gnu-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]" +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=130000 +CONFIG_CLANG_VERSION=0 +CONFIG_AS_IS_GNU=y +CONFIG_AS_VERSION=23850 +CONFIG_LD_IS_BFD=y +CONFIG_LD_VERSION=23850 +CONFIG_LLD_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_CAN_LINK_STATIC=y +CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y +CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y +CONFIG_PAHOLE_VERSION=0 +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_TABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +# CONFIG_WERROR is not set +CONFIG_LOCALVERSION="" +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_BUILD_SALT="" +CONFIG_DEFAULT_INIT="" +CONFIG_DEFAULT_HOSTNAME="@DISTRONAME@" +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_SYSVIPC_COMPAT=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +# CONFIG_WATCH_QUEUE is not set +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +# CONFIG_AUDIT is not set +CONFIG_HAVE_ARCH_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_SHOW_LEVEL=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_HARDIRQS_SW_RESEND=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_IRQ_IPI=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem 
+ +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_ARCH_HAS_TICK_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y +CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y +CONFIG_CONTEXT_TRACKING=y +CONFIG_CONTEXT_TRACKING_IDLE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +CONFIG_BPF=y +CONFIG_HAVE_EBPF_JIT=y +CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y + +# +# BPF subsystem +# +CONFIG_BPF_SYSCALL=y +# CONFIG_BPF_JIT is not set +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set +# CONFIG_BPF_PRELOAD is not set +# end of BPF subsystem + +CONFIG_PREEMPT_VOLUNTARY_BUILD=y +# CONFIG_PREEMPT_NONE is not set +CONFIG_PREEMPT_VOLUNTARY=y +# CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_DYNAMIC is not set + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +# CONFIG_IRQ_TIME_ACCOUNTING is not set +# CONFIG_BSD_PROCESS_ACCT is not set +# CONFIG_TASKSTATS is not set +# CONFIG_PSI is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +# CONFIG_RCU_EXPERT is not set +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RUDE_RCU=y +CONFIG_TASKS_TRACE_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +# end of RCU Subsystem + +CONFIG_IKCONFIG=m +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +# CONFIG_PRINTK_INDEX is not set +CONFIG_GENERIC_SCHED_CLOCK=y + +# +# Scheduler features +# +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_CC_HAS_INT128=y +CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5" +CONFIG_GCC11_NO_ARRAY_BOUNDS=y +CONFIG_CC_NO_ARRAY_BOUNDS=y 
+CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +# CONFIG_CGROUP_FAVOR_DYNMODS is not set +CONFIG_MEMCG=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_CGROUP_PIDS=y +# CONFIG_CGROUP_RDMA is not set +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_MISC is not set +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_CHECKPOINT_RESTORE is not set +# CONFIG_SCHED_AUTOGROUP is not set +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="@INITRAMFS_SOURCE@" +CONFIG_INITRAMFS_ROOT_UID=0 +CONFIG_INITRAMFS_ROOT_GID=0 +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +# CONFIG_RD_ZSTD is not set +CONFIG_INITRAMFS_COMPRESSION_NONE=y +CONFIG_BOOT_CONFIG=y +# CONFIG_BOOT_CONFIG_EMBED is not set +CONFIG_INITRAMFS_PRESERVE_MTIME=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_LD_ORPHAN_WARN=y +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_EXPERT=y +CONFIG_UID16=y +CONFIG_MULTIUSER=y +# CONFIG_SGETMASK_SYSCALL is not set +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_ALL is 
not set +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_KCMP=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +CONFIG_EMBEDDED=y +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_ARM64=y +CONFIG_GCC_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_64BIT=y +CONFIG_MMU=y +CONFIG_ARM64_PAGE_SHIFT=12 +CONFIG_ARM64_CONT_PTE_SHIFT=4 +CONFIG_ARM64_CONT_PMD_SHIFT=4 +CONFIG_ARCH_MMAP_RND_BITS_MIN=18 +CONFIG_ARCH_MMAP_RND_BITS_MAX=24 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=11 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_GENERIC_CSUM=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y +CONFIG_SMP=y +CONFIG_KERNEL_MODE_NEON=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_PGTABLE_LEVELS=3 +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_ARCH_PROC_KCORE_TEXT=y + +# +# Platform selection +# +# CONFIG_ARCH_ACTIONS is not set +# CONFIG_ARCH_SUNXI is not set +# CONFIG_ARCH_ALPINE is not set +# CONFIG_ARCH_APPLE is not set +CONFIG_ARCH_BCM=y +CONFIG_ARCH_BCM2835=y +# CONFIG_ARCH_BCM_IPROC is not set +# CONFIG_ARCH_BCMBCA is not set +CONFIG_ARCH_BRCMSTB=y +# CONFIG_ARCH_BERLIN is not set +# CONFIG_ARCH_BITMAIN is not set +# CONFIG_ARCH_EXYNOS is not set +# CONFIG_ARCH_SPARX5 is not set +# CONFIG_ARCH_K3 is not set +# CONFIG_ARCH_LG1K is not set +# CONFIG_ARCH_HISI is not set +# CONFIG_ARCH_KEEMBAY is not set +# CONFIG_ARCH_MEDIATEK is not set +# CONFIG_ARCH_MESON is not set +# CONFIG_ARCH_MVEBU is not set +# CONFIG_ARCH_NXP is not set +# CONFIG_ARCH_NPCM is not set +# 
CONFIG_ARCH_QCOM is not set +# CONFIG_ARCH_REALTEK is not set +# CONFIG_ARCH_RENESAS is not set +# CONFIG_ARCH_ROCKCHIP is not set +# CONFIG_ARCH_SEATTLE is not set +# CONFIG_ARCH_INTEL_SOCFPGA is not set +# CONFIG_ARCH_SYNQUACER is not set +# CONFIG_ARCH_TEGRA is not set +# CONFIG_ARCH_SPRD is not set +# CONFIG_ARCH_THUNDER is not set +# CONFIG_ARCH_THUNDER2 is not set +# CONFIG_ARCH_UNIPHIER is not set +# CONFIG_ARCH_VEXPRESS is not set +# CONFIG_ARCH_VISCONTI is not set +# CONFIG_ARCH_XGENE is not set +# CONFIG_ARCH_ZYNQMP is not set +# end of Platform selection + +# +# Kernel Features +# + +# +# ARM errata workarounds via the alternatives framework +# +CONFIG_AMPERE_ERRATUM_AC03_CPU_38=y +CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y +CONFIG_ARM64_ERRATUM_826319=y +CONFIG_ARM64_ERRATUM_827319=y +CONFIG_ARM64_ERRATUM_824069=y +CONFIG_ARM64_ERRATUM_819472=y +CONFIG_ARM64_ERRATUM_832075=y +CONFIG_ARM64_ERRATUM_1742098=y +CONFIG_ARM64_ERRATUM_845719=y +CONFIG_ARM64_ERRATUM_843419=y +CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y +CONFIG_ARM64_ERRATUM_1024718=y +CONFIG_ARM64_ERRATUM_1418040=y +CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT=y +CONFIG_ARM64_ERRATUM_1165522=y +CONFIG_ARM64_ERRATUM_1319367=y +CONFIG_ARM64_ERRATUM_1530923=y +CONFIG_ARM64_WORKAROUND_REPEAT_TLBI=y +CONFIG_ARM64_ERRATUM_2441007=y +CONFIG_ARM64_ERRATUM_1286807=y +CONFIG_ARM64_ERRATUM_1463225=y +CONFIG_ARM64_ERRATUM_1542419=y +CONFIG_ARM64_ERRATUM_1508412=y +CONFIG_ARM64_ERRATUM_2051678=y +CONFIG_ARM64_ERRATUM_2077057=y +CONFIG_ARM64_ERRATUM_2658417=y +CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE=y +CONFIG_ARM64_ERRATUM_2054223=y +CONFIG_ARM64_ERRATUM_2067961=y +CONFIG_ARM64_ERRATUM_2441009=y +CONFIG_ARM64_ERRATUM_2457168=y +CONFIG_ARM64_ERRATUM_2966298=y +CONFIG_CAVIUM_ERRATUM_22375=y +CONFIG_CAVIUM_ERRATUM_23154=y +CONFIG_CAVIUM_ERRATUM_27456=y +CONFIG_CAVIUM_ERRATUM_30115=y +CONFIG_CAVIUM_TX2_ERRATUM_219=y +CONFIG_FUJITSU_ERRATUM_010001=y +CONFIG_HISILICON_ERRATUM_161600802=y +CONFIG_QCOM_FALKOR_ERRATUM_1003=y 
+CONFIG_QCOM_FALKOR_ERRATUM_1009=y +CONFIG_QCOM_QDF2400_ERRATUM_0065=y +CONFIG_QCOM_FALKOR_ERRATUM_E1041=y +CONFIG_NVIDIA_CARMEL_CNP_ERRATUM=y +CONFIG_SOCIONEXT_SYNQUACER_PREITS=y +# end of ARM errata workarounds via the alternatives framework + +CONFIG_ARM64_4K_PAGES=y +# CONFIG_ARM64_16K_PAGES is not set +# CONFIG_ARM64_64K_PAGES is not set +CONFIG_ARM64_VA_BITS_39=y +# CONFIG_ARM64_VA_BITS_48 is not set +CONFIG_ARM64_VA_BITS=39 +CONFIG_ARM64_PA_BITS_48=y +CONFIG_ARM64_PA_BITS=48 +# CONFIG_CPU_BIG_ENDIAN is not set +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_SCHED_MC is not set +# CONFIG_SCHED_CLUSTER is not set +# CONFIG_SCHED_SMT is not set +CONFIG_NR_CPUS=256 +# CONFIG_HOTPLUG_CPU is not set +# CONFIG_NUMA is not set +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_HW_PERF_EVENTS=y +CONFIG_CC_HAVE_SHADOW_CALL_STACK=y +# CONFIG_PARAVIRT is not set +# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set +# CONFIG_KEXEC_FILE is not set +# CONFIG_CRASH_DUMP is not set +# CONFIG_XEN is not set +CONFIG_ARCH_FORCE_MAX_ORDER=11 +CONFIG_UNMAP_KERNEL_AT_EL0=y +CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY=y +CONFIG_RODATA_FULL_DEFAULT_ENABLED=y +# CONFIG_ARM64_SW_TTBR0_PAN is not set +CONFIG_ARM64_TAGGED_ADDR_ABI=y +CONFIG_COMPAT=y +CONFIG_KUSER_HELPERS=y +# CONFIG_COMPAT_ALIGNMENT_FIXUPS is not set +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y + +# +# ARMv8.1 architectural features +# +CONFIG_ARM64_HW_AFDBM=y +CONFIG_ARM64_PAN=y +CONFIG_AS_HAS_LDAPR=y +CONFIG_AS_HAS_LSE_ATOMICS=y +CONFIG_ARM64_LSE_ATOMICS=y +CONFIG_ARM64_USE_LSE_ATOMICS=y +# end of ARMv8.1 architectural features + +# +# ARMv8.2 architectural features +# +CONFIG_AS_HAS_ARMV8_2=y +CONFIG_AS_HAS_SHA3=y +# CONFIG_ARM64_PMEM is not set +CONFIG_ARM64_RAS_EXTN=y +CONFIG_ARM64_CNP=y +# end of ARMv8.2 architectural features + +# +# ARMv8.3 
architectural features +# +CONFIG_ARM64_PTR_AUTH=y +CONFIG_ARM64_PTR_AUTH_KERNEL=y +CONFIG_CC_HAS_BRANCH_PROT_PAC_RET=y +CONFIG_CC_HAS_SIGN_RETURN_ADDRESS=y +CONFIG_AS_HAS_PAC=y +CONFIG_AS_HAS_CFI_NEGATE_RA_STATE=y +# end of ARMv8.3 architectural features + +# +# ARMv8.4 architectural features +# +CONFIG_ARM64_AMU_EXTN=y +CONFIG_AS_HAS_ARMV8_4=y +CONFIG_ARM64_TLB_RANGE=y +# end of ARMv8.4 architectural features + +# +# ARMv8.5 architectural features +# +CONFIG_AS_HAS_ARMV8_5=y +CONFIG_ARM64_BTI=y +CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI=y +CONFIG_ARM64_E0PD=y +CONFIG_ARM64_AS_HAS_MTE=y +CONFIG_ARM64_MTE=y +# end of ARMv8.5 architectural features + +# +# ARMv8.7 architectural features +# +CONFIG_ARM64_EPAN=y +# end of ARMv8.7 architectural features + +CONFIG_ARM64_SVE=y +CONFIG_ARM64_SME=y +CONFIG_ARM64_MODULE_PLTS=y +# CONFIG_ARM64_PSEUDO_NMI is not set +CONFIG_RELOCATABLE=y +# CONFIG_RANDOMIZE_BASE is not set +CONFIG_CC_HAVE_STACKPROTECTOR_SYSREG=y +CONFIG_STACKPROTECTOR_PER_TASK=y +CONFIG_ARCH_NR_GPIO=0 +# end of Kernel Features + +# +# Boot options +# +CONFIG_CMDLINE="" +CONFIG_EFI_STUB=y +CONFIG_EFI=y +CONFIG_DMI=y +# end of Boot options + +# +# Power management options +# +# CONFIG_SUSPEND is not set +# CONFIG_HIBERNATION is not set +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_CPU_PM=y +# CONFIG_ENERGY_MODEL is not set +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +# end of Power management options + +# +# CPU Power Management +# + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +# CONFIG_CPU_IDLE_GOV_LADDER is not set +CONFIG_CPU_IDLE_GOV_MENU=y +# CONFIG_CPU_IDLE_GOV_TEO is not set + +# +# ARM CPU Idle Drivers +# +# CONFIG_ARM_PSCI_CPUIDLE is not set +# end of ARM CPU Idle Drivers +# end of CPU Idle + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y 
+CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +# CONFIG_CPU_FREQ_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +# CONFIG_CPU_FREQ_GOV_SCHEDUTIL is not set + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=y +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_ARM_BRCMSTB_AVS_CPUFREQ=y +CONFIG_ARM_RASPBERRYPI_CPUFREQ=y +# end of CPU Frequency scaling +# end of CPU Power Management + +CONFIG_ARCH_SUPPORTS_ACPI=y +# CONFIG_ACPI is not set +CONFIG_HAVE_KVM=y +# CONFIG_VIRTUALIZATION is not set + +# +# General architecture-dependent options +# +CONFIG_ARCH_HAS_SUBPAGE_FAULTS=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_KRETPROBES=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_GENERIC_IDLE_POLL_SETUP=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_KEEPINITRD=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_NO_INSTR=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y 
+CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_MMU_GATHER_TABLE_FREE=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_HAVE_ARCH_SECCOMP=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y +# CONFIG_SECCOMP_CACHE_DEBUG is not set +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_ARCH_SUPPORTS_SHADOW_CALL_STACK=y +# CONFIG_SHADOW_CALL_STACK is not set +CONFIG_ARCH_SUPPORTS_LTO_CLANG=y +CONFIG_ARCH_SUPPORTS_LTO_CLANG_THIN=y +CONFIG_LTO_NONE=y +CONFIG_ARCH_SUPPORTS_CFI_CLANG=y +CONFIG_HAVE_CONTEXT_TRACKING_USER=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PUD=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_HUGE_VMALLOC=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK=y +CONFIG_SOFTIRQ_ON_OWN_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_ARCH_MMAP_RND_BITS=18 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=11 +CONFIG_PAGE_SIZE_LESS_THAN_64KB=y +CONFIG_PAGE_SIZE_LESS_THAN_256KB=y +CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT=y +CONFIG_CLONE_BACKWARDS=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET=y +CONFIG_RANDOMIZE_KSTACK_OFFSET=y +# CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT is not set +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_COMPILER_H=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y 
+CONFIG_ARCH_USE_MEMREMAP_PROT=y +# CONFIG_LOCK_EVENT_COUNTS is not set +CONFIG_ARCH_HAS_RELR=y +CONFIG_HAVE_PREEMPT_DYNAMIC=y +CONFIG_HAVE_PREEMPT_DYNAMIC_KEY=y +CONFIG_ARCH_WANT_LD_ORPHAN_WARN=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_ARCH_SUPPORTS_PAGE_TABLE_CHECK=y +CONFIG_ARCH_HAVE_TRACE_MMIO_ACCESS=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +# CONFIG_MODULE_UNLOAD_TAINT_TRACKING is not set +# CONFIG_MODVERSIONS is not set +# CONFIG_MODULE_SRCVERSION_ALL is not set +# CONFIG_MODULE_SIG is not set +CONFIG_MODULE_COMPRESS_NONE=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +# CONFIG_MODULE_COMPRESS_XZ is not set +# CONFIG_MODULE_COMPRESS_ZSTD is not set +# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set +CONFIG_MODPROBE_PATH="/sbin/modprobe" +# CONFIG_TRIM_UNUSED_KSYMS is not set +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLOCK_LEGACY_AUTOLOAD=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG_COMMON=y +CONFIG_BLK_ICQ=y +CONFIG_BLK_DEV_BSGLIB=y +# CONFIG_BLK_DEV_INTEGRITY is not set +# CONFIG_BLK_DEV_ZONED is not set +CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set +# CONFIG_BLK_WBT is not set +# CONFIG_BLK_CGROUP_IOLATENCY is not set +# CONFIG_BLK_CGROUP_IOCOST is not set +# CONFIG_BLK_CGROUP_IOPRIO is not set +CONFIG_BLK_DEBUG_FS=y +# CONFIG_BLK_SED_OPAL is not set +# CONFIG_BLK_INLINE_ENCRYPTION is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +# CONFIG_AIX_PARTITION is not set +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set 
+# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +# CONFIG_BSD_DISKLABEL is not set +# CONFIG_MINIX_SUBPARTITION is not set +# CONFIG_SOLARIS_X86_PARTITION is not set +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +# CONFIG_KARMA_PARTITION is not set +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_PM=y +CONFIG_BLOCK_HOLDER_DEPRECATED=y +CONFIG_BLK_MQ_STACKING=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +# CONFIG_BFQ_GROUP_IOSCHED is not set +# end of IO Schedulers + +CONFIG_ASN1=y +CONFIG_ARCH_INLINE_SPIN_TRYLOCK=y +CONFIG_ARCH_INLINE_SPIN_TRYLOCK_BH=y +CONFIG_ARCH_INLINE_SPIN_LOCK=y +CONFIG_ARCH_INLINE_SPIN_LOCK_BH=y +CONFIG_ARCH_INLINE_SPIN_LOCK_IRQ=y +CONFIG_ARCH_INLINE_SPIN_LOCK_IRQSAVE=y +CONFIG_ARCH_INLINE_SPIN_UNLOCK=y +CONFIG_ARCH_INLINE_SPIN_UNLOCK_BH=y +CONFIG_ARCH_INLINE_SPIN_UNLOCK_IRQ=y +CONFIG_ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE=y +CONFIG_ARCH_INLINE_READ_LOCK=y +CONFIG_ARCH_INLINE_READ_LOCK_BH=y +CONFIG_ARCH_INLINE_READ_LOCK_IRQ=y +CONFIG_ARCH_INLINE_READ_LOCK_IRQSAVE=y +CONFIG_ARCH_INLINE_READ_UNLOCK=y +CONFIG_ARCH_INLINE_READ_UNLOCK_BH=y +CONFIG_ARCH_INLINE_READ_UNLOCK_IRQ=y +CONFIG_ARCH_INLINE_READ_UNLOCK_IRQRESTORE=y +CONFIG_ARCH_INLINE_WRITE_LOCK=y +CONFIG_ARCH_INLINE_WRITE_LOCK_BH=y +CONFIG_ARCH_INLINE_WRITE_LOCK_IRQ=y +CONFIG_ARCH_INLINE_WRITE_LOCK_IRQSAVE=y +CONFIG_ARCH_INLINE_WRITE_UNLOCK=y +CONFIG_ARCH_INLINE_WRITE_UNLOCK_BH=y +CONFIG_ARCH_INLINE_WRITE_UNLOCK_IRQ=y +CONFIG_ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE=y +CONFIG_INLINE_SPIN_TRYLOCK=y +CONFIG_INLINE_SPIN_TRYLOCK_BH=y +CONFIG_INLINE_SPIN_LOCK=y +CONFIG_INLINE_SPIN_LOCK_BH=y 
+CONFIG_INLINE_SPIN_LOCK_IRQ=y +CONFIG_INLINE_SPIN_LOCK_IRQSAVE=y +CONFIG_INLINE_SPIN_UNLOCK_BH=y +CONFIG_INLINE_SPIN_UNLOCK_IRQ=y +CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE=y +CONFIG_INLINE_READ_LOCK=y +CONFIG_INLINE_READ_LOCK_BH=y +CONFIG_INLINE_READ_LOCK_IRQ=y +CONFIG_INLINE_READ_LOCK_IRQSAVE=y +CONFIG_INLINE_READ_UNLOCK=y +CONFIG_INLINE_READ_UNLOCK_BH=y +CONFIG_INLINE_READ_UNLOCK_IRQ=y +CONFIG_INLINE_READ_UNLOCK_IRQRESTORE=y +CONFIG_INLINE_WRITE_LOCK=y +CONFIG_INLINE_WRITE_LOCK_BH=y +CONFIG_INLINE_WRITE_LOCK_IRQ=y +CONFIG_INLINE_WRITE_LOCK_IRQSAVE=y +CONFIG_INLINE_WRITE_UNLOCK=y +CONFIG_INLINE_WRITE_UNLOCK_BH=y +CONFIG_INLINE_WRITE_UNLOCK_IRQ=y +CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ARCH_BINFMT_ELF_STATE=y +CONFIG_ARCH_BINFMT_ELF_EXTRA_PHDRS=y +CONFIG_ARCH_HAVE_ELF_PROT=y +CONFIG_ARCH_USE_GNU_PROPERTY=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +# CONFIG_BINFMT_MISC is not set +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SWAP=y +# CONFIG_ZSWAP is not set + +# +# SLAB allocator options +# +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +# CONFIG_SLAB_FREELIST_RANDOM is not set +# CONFIG_SLAB_FREELIST_HARDENED is not set +# CONFIG_SLUB_STATS is not set +CONFIG_SLUB_CPU_PARTIAL=y +# end of SLAB allocator options + +# CONFIG_SHUFFLE_PAGE_ALLOCATOR is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SPARSEMEM=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y 
+CONFIG_HAVE_FAST_GUP=y +CONFIG_ARCH_KEEP_MEMBLOCK=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +# CONFIG_MEMORY_HOTPLUG is not set +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_COMPACTION=y +CONFIG_COMPACT_UNEVICTABLE_DEFAULT=1 +# CONFIG_PAGE_REPORTING is not set +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +# CONFIG_KSM is not set +CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +# CONFIG_MEMORY_FAILURE is not set +CONFIG_ARCH_WANTS_THP_SWAP=y +# CONFIG_TRANSPARENT_HUGEPAGE is not set +CONFIG_CMA=y +# CONFIG_CMA_DEBUG is not set +CONFIG_CMA_DEBUGFS=y +# CONFIG_CMA_SYSFS is not set +CONFIG_CMA_AREAS=7 +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_CURRENT_STACK_POINTER=y +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ARCH_HAS_ZONE_DMA_SET=y +CONFIG_ZONE_DMA=y +CONFIG_ZONE_DMA32=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_VM_EVENT_COUNTERS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_TEST is not set +CONFIG_ARCH_HAS_PTE_SPECIAL=y +# CONFIG_ANON_VMA_NAME is not set +# CONFIG_USERFAULTFD is not set +# CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y + +# +# Data Access Monitoring +# +# CONFIG_DAMON is not set +# end of Data Access Monitoring +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_EGRESS=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_DIAG is not set +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_AF_UNIX_OOB=y +# CONFIG_UNIX_DIAG is not set +# CONFIG_TLS is not set +CONFIG_XFRM=y +CONFIG_XFRM_ALGO=y +CONFIG_XFRM_USER=y +# CONFIG_XFRM_INTERFACE is not set +# CONFIG_XFRM_SUB_POLICY is not set +# CONFIG_XFRM_MIGRATE is not set +# CONFIG_XFRM_STATISTICS is not set +CONFIG_XFRM_ESP=y +# CONFIG_NET_KEY is not set 
+# CONFIG_XDP_SOCKETS is not set +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +# CONFIG_IP_ROUTE_MULTIPATH is not set +# CONFIG_IP_ROUTE_VERBOSE is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +# CONFIG_IP_PNP_RARP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE_DEMUX is not set +CONFIG_NET_IP_TUNNEL=m +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +# CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set +# CONFIG_IP_PIMSM_V1 is not set +# CONFIG_IP_PIMSM_V2 is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_NET_IPVTI is not set +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +# CONFIG_NET_FOU_IP_TUNNELS is not set +# CONFIG_INET_AH is not set +CONFIG_INET_ESP=y +# CONFIG_INET_ESP_OFFLOAD is not set +# CONFIG_INET_ESPINTCP is not set +# CONFIG_INET_IPCOMP is not set +CONFIG_INET_TABLE_PERTURB_ORDER=16 +CONFIG_INET_TUNNEL=m +# CONFIG_INET_DIAG is not set +CONFIG_TCP_CONG_ADVANCED=y +# CONFIG_TCP_CONG_BIC is not set +CONFIG_TCP_CONG_CUBIC=y +# CONFIG_TCP_CONG_WESTWOOD is not set +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +# CONFIG_TCP_CONG_HYBLA is not set +CONFIG_TCP_CONG_VEGAS=m +# CONFIG_TCP_CONG_NV is not set +CONFIG_TCP_CONG_SCALABLE=m +# CONFIG_TCP_CONG_LP is not set +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +# CONFIG_TCP_CONG_DCTCP is not set +CONFIG_TCP_CONG_CDG=m +# CONFIG_TCP_CONG_BBR is not set +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +# CONFIG_TCP_MD5SIG is not set +CONFIG_IPV6=y +# CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_IPV6_OPTIMISTIC_DAD is not set +# CONFIG_INET6_AH is not set +# CONFIG_INET6_ESP is not set +# CONFIG_INET6_IPCOMP is not set +# CONFIG_IPV6_MIP6 is not set +# CONFIG_IPV6_ILA is not set +# CONFIG_IPV6_VTI is not set +CONFIG_IPV6_SIT=m +# CONFIG_IPV6_SIT_6RD is not set +CONFIG_IPV6_NDISC_NODETYPE=y +# CONFIG_IPV6_TUNNEL is not set 
+CONFIG_IPV6_FOU=m +# CONFIG_IPV6_MULTIPLE_TABLES is not set +# CONFIG_IPV6_MROUTE is not set +# CONFIG_IPV6_SEG6_LWTUNNEL is not set +# CONFIG_IPV6_SEG6_HMAC is not set +# CONFIG_IPV6_RPL_LWTUNNEL is not set +# CONFIG_IPV6_IOAM6_LWTUNNEL is not set +# CONFIG_MPTCP is not set +# CONFIG_NETWORK_SECMARK is not set +# CONFIG_NETWORK_PHY_TIMESTAMPING is not set +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +# CONFIG_NETFILTER_INGRESS is not set +CONFIG_NETFILTER_EGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +# CONFIG_NETFILTER_NETLINK_ACCT is not set +# CONFIG_NETFILTER_NETLINK_QUEUE is not set +CONFIG_NETFILTER_NETLINK_LOG=m +# CONFIG_NETFILTER_NETLINK_OSF is not set +CONFIG_NF_CONNTRACK=m +# CONFIG_NF_LOG_SYSLOG is not set +# CONFIG_NF_CONNTRACK_MARK is not set +# CONFIG_NF_CONNTRACK_ZONES is not set +# CONFIG_NF_CONNTRACK_PROCFS is not set +# CONFIG_NF_CONNTRACK_EVENTS is not set +# CONFIG_NF_CONNTRACK_TIMEOUT is not set +# CONFIG_NF_CONNTRACK_TIMESTAMP is not set +# CONFIG_NF_CONNTRACK_LABELS is not set +# CONFIG_NF_CT_PROTO_DCCP is not set +# CONFIG_NF_CT_PROTO_SCTP is not set +# CONFIG_NF_CT_PROTO_UDPLITE is not set +# CONFIG_NF_CONNTRACK_AMANDA is not set +CONFIG_NF_CONNTRACK_FTP=m +# CONFIG_NF_CONNTRACK_H323 is not set +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +# CONFIG_NF_CONNTRACK_SNMP is not set +# CONFIG_NF_CONNTRACK_PPTP is not set +# CONFIG_NF_CONNTRACK_SANE is not set +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +# CONFIG_NETFILTER_NETLINK_GLUE_CT is not set +CONFIG_NF_NAT=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +# CONFIG_NF_TABLES is not set +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XTABLES_COMPAT=y + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +# 
CONFIG_NETFILTER_XT_CONNMARK is not set + +# +# Xtables targets +# +# CONFIG_NETFILTER_XT_TARGET_CHECKSUM is not set +# CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set +# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set +# CONFIG_NETFILTER_XT_TARGET_DSCP is not set +# CONFIG_NETFILTER_XT_TARGET_HL is not set +# CONFIG_NETFILTER_XT_TARGET_HMARK is not set +# CONFIG_NETFILTER_XT_TARGET_IDLETIMER is not set +# CONFIG_NETFILTER_XT_TARGET_LED is not set +# CONFIG_NETFILTER_XT_TARGET_LOG is not set +# CONFIG_NETFILTER_XT_TARGET_MARK is not set +CONFIG_NETFILTER_XT_NAT=m +# CONFIG_NETFILTER_XT_TARGET_NETMAP is not set +# CONFIG_NETFILTER_XT_TARGET_NFLOG is not set +# CONFIG_NETFILTER_XT_TARGET_NFQUEUE is not set +# CONFIG_NETFILTER_XT_TARGET_RATEEST is not set +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +# CONFIG_NETFILTER_XT_TARGET_TEE is not set +# CONFIG_NETFILTER_XT_TARGET_TPROXY is not set +# CONFIG_NETFILTER_XT_TARGET_TCPMSS is not set +# CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP is not set + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +# CONFIG_NETFILTER_XT_MATCH_BPF is not set +# CONFIG_NETFILTER_XT_MATCH_CGROUP is not set +# CONFIG_NETFILTER_XT_MATCH_CLUSTER is not set +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +# CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set +# CONFIG_NETFILTER_XT_MATCH_CONNLABEL is not set +# CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set +# CONFIG_NETFILTER_XT_MATCH_CONNMARK is not set +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +# CONFIG_NETFILTER_XT_MATCH_CPU is not set +# CONFIG_NETFILTER_XT_MATCH_DCCP is not set +# CONFIG_NETFILTER_XT_MATCH_DEVGROUP is not set +# CONFIG_NETFILTER_XT_MATCH_DSCP is not set +# CONFIG_NETFILTER_XT_MATCH_ECN is not set +# CONFIG_NETFILTER_XT_MATCH_ESP is not set +# CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set +# CONFIG_NETFILTER_XT_MATCH_HELPER is not set +# CONFIG_NETFILTER_XT_MATCH_HL is not set +# CONFIG_NETFILTER_XT_MATCH_IPCOMP is not set 
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +# CONFIG_NETFILTER_XT_MATCH_L2TP is not set +# CONFIG_NETFILTER_XT_MATCH_LENGTH is not set +# CONFIG_NETFILTER_XT_MATCH_LIMIT is not set +# CONFIG_NETFILTER_XT_MATCH_MAC is not set +# CONFIG_NETFILTER_XT_MATCH_MARK is not set +# CONFIG_NETFILTER_XT_MATCH_MULTIPORT is not set +# CONFIG_NETFILTER_XT_MATCH_NFACCT is not set +# CONFIG_NETFILTER_XT_MATCH_OSF is not set +CONFIG_NETFILTER_XT_MATCH_OWNER=m +# CONFIG_NETFILTER_XT_MATCH_POLICY is not set +# CONFIG_NETFILTER_XT_MATCH_PHYSDEV is not set +# CONFIG_NETFILTER_XT_MATCH_PKTTYPE is not set +# CONFIG_NETFILTER_XT_MATCH_QUOTA is not set +# CONFIG_NETFILTER_XT_MATCH_RATEEST is not set +# CONFIG_NETFILTER_XT_MATCH_REALM is not set +# CONFIG_NETFILTER_XT_MATCH_RECENT is not set +# CONFIG_NETFILTER_XT_MATCH_SCTP is not set +# CONFIG_NETFILTER_XT_MATCH_SOCKET is not set +CONFIG_NETFILTER_XT_MATCH_STATE=m +# CONFIG_NETFILTER_XT_MATCH_STATISTIC is not set +# CONFIG_NETFILTER_XT_MATCH_STRING is not set +# CONFIG_NETFILTER_XT_MATCH_TCPMSS is not set +# CONFIG_NETFILTER_XT_MATCH_TIME is not set +# CONFIG_NETFILTER_XT_MATCH_U32 is not set +# end of Core Netfilter Configuration + +# CONFIG_IP_SET is not set +CONFIG_IP_VS=m +# CONFIG_IP_VS_IPV6 is not set +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=12 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +# CONFIG_IP_VS_PROTO_ESP is not set +# CONFIG_IP_VS_PROTO_AH is not set +# CONFIG_IP_VS_PROTO_SCTP is not set + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +# CONFIG_IP_VS_WRR is not set +# CONFIG_IP_VS_LC is not set +# CONFIG_IP_VS_WLC is not set +# CONFIG_IP_VS_FO is not set +# CONFIG_IP_VS_OVF is not set +# CONFIG_IP_VS_LBLC is not set +# CONFIG_IP_VS_LBLCR is not set +# CONFIG_IP_VS_DH is not set +# CONFIG_IP_VS_SH is not set +# CONFIG_IP_VS_MH is not set +# CONFIG_IP_VS_SED is not set +# CONFIG_IP_VS_NQ is not set +# CONFIG_IP_VS_TWOS is 
not set + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +# CONFIG_IP_VS_FTP is not set +CONFIG_IP_VS_NFCT=y +# CONFIG_IP_VS_PE_SIP is not set + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +# CONFIG_NF_SOCKET_IPV4 is not set +# CONFIG_NF_TPROXY_IPV4 is not set +# CONFIG_NF_DUP_IPV4 is not set +# CONFIG_NF_LOG_ARP is not set +# CONFIG_NF_LOG_IPV4 is not set +CONFIG_NF_REJECT_IPV4=m +CONFIG_IP_NF_IPTABLES=m +# CONFIG_IP_NF_MATCH_AH is not set +# CONFIG_IP_NF_MATCH_ECN is not set +# CONFIG_IP_NF_MATCH_RPFILTER is not set +# CONFIG_IP_NF_MATCH_TTL is not set +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +# CONFIG_IP_NF_TARGET_SYNPROXY is not set +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +# CONFIG_IP_NF_TARGET_NETMAP is not set +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +# CONFIG_IP_NF_TARGET_CLUSTERIP is not set +# CONFIG_IP_NF_TARGET_ECN is not set +# CONFIG_IP_NF_TARGET_TTL is not set +# CONFIG_IP_NF_RAW is not set +# CONFIG_IP_NF_ARPTABLES is not set +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +# CONFIG_NF_SOCKET_IPV6 is not set +# CONFIG_NF_TPROXY_IPV6 is not set +# CONFIG_NF_DUP_IPV6 is not set +CONFIG_NF_REJECT_IPV6=m +# CONFIG_NF_LOG_IPV6 is not set +CONFIG_IP6_NF_IPTABLES=m +# CONFIG_IP6_NF_MATCH_AH is not set +# CONFIG_IP6_NF_MATCH_EUI64 is not set +# CONFIG_IP6_NF_MATCH_FRAG is not set +# CONFIG_IP6_NF_MATCH_OPTS is not set +# CONFIG_IP6_NF_MATCH_HL is not set +# CONFIG_IP6_NF_MATCH_IPV6HEADER is not set +# CONFIG_IP6_NF_MATCH_MH is not set +# CONFIG_IP6_NF_MATCH_RPFILTER is not set +# CONFIG_IP6_NF_MATCH_RT is not set +# CONFIG_IP6_NF_MATCH_SRH is not set +# CONFIG_IP6_NF_TARGET_HL is not set +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +# CONFIG_IP6_NF_TARGET_SYNPROXY is not set +CONFIG_IP6_NF_MANGLE=m +# CONFIG_IP6_NF_RAW is not set +CONFIG_IP6_NF_NAT=m 
+CONFIG_IP6_NF_TARGET_MASQUERADE=m +# CONFIG_IP6_NF_TARGET_NPT is not set +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +# CONFIG_NF_CONNTRACK_BRIDGE is not set +# CONFIG_BRIDGE_NF_EBTABLES is not set +# CONFIG_BPFILTER is not set +# CONFIG_IP_DCCP is not set +# CONFIG_IP_SCTP is not set +# CONFIG_RDS is not set +# CONFIG_TIPC is not set +# CONFIG_ATM is not set +# CONFIG_L2TP is not set +CONFIG_STP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +# CONFIG_BRIDGE_MRP is not set +# CONFIG_BRIDGE_CFM is not set +# CONFIG_NET_DSA is not set +CONFIG_VLAN_8021Q=m +# CONFIG_VLAN_8021Q_GVRP is not set +# CONFIG_VLAN_8021Q_MVRP is not set +CONFIG_LLC=m +# CONFIG_LLC2 is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_PHONET is not set +# CONFIG_6LOWPAN is not set +# CONFIG_IEEE802154 is not set +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +# CONFIG_NET_SCH_CBQ is not set +# CONFIG_NET_SCH_HTB is not set +# CONFIG_NET_SCH_HFSC is not set +# CONFIG_NET_SCH_PRIO is not set +# CONFIG_NET_SCH_MULTIQ is not set +# CONFIG_NET_SCH_RED is not set +# CONFIG_NET_SCH_SFB is not set +# CONFIG_NET_SCH_SFQ is not set +# CONFIG_NET_SCH_TEQL is not set +# CONFIG_NET_SCH_TBF is not set +# CONFIG_NET_SCH_CBS is not set +# CONFIG_NET_SCH_ETF is not set +# CONFIG_NET_SCH_TAPRIO is not set +# CONFIG_NET_SCH_GRED is not set +# CONFIG_NET_SCH_DSMARK is not set +# CONFIG_NET_SCH_NETEM is not set +# CONFIG_NET_SCH_DRR is not set +# CONFIG_NET_SCH_MQPRIO is not set +# CONFIG_NET_SCH_SKBPRIO is not set +# CONFIG_NET_SCH_CHOKE is not set +# CONFIG_NET_SCH_QFQ is not set +# CONFIG_NET_SCH_CODEL is not set +CONFIG_NET_SCH_FQ_CODEL=y +# CONFIG_NET_SCH_CAKE is not set +# CONFIG_NET_SCH_FQ is not set +# CONFIG_NET_SCH_HHF is not set +# CONFIG_NET_SCH_PIE is not set +# CONFIG_NET_SCH_PLUG is not set +# CONFIG_NET_SCH_ETS is not set +# CONFIG_NET_SCH_DEFAULT is not set + +# +# Classification +# 
+CONFIG_NET_CLS=y +# CONFIG_NET_CLS_BASIC is not set +# CONFIG_NET_CLS_ROUTE4 is not set +# CONFIG_NET_CLS_FW is not set +# CONFIG_NET_CLS_U32 is not set +# CONFIG_NET_CLS_FLOW is not set +CONFIG_NET_CLS_CGROUP=m +# CONFIG_NET_CLS_BPF is not set +# CONFIG_NET_CLS_FLOWER is not set +# CONFIG_NET_CLS_MATCHALL is not set +# CONFIG_NET_EMATCH is not set +# CONFIG_NET_CLS_ACT is not set +CONFIG_NET_SCH_FIFO=y +# CONFIG_DCB is not set +CONFIG_DNS_RESOLVER=y +# CONFIG_BATMAN_ADV is not set +# CONFIG_OPENVSWITCH is not set +# CONFIG_VSOCKETS is not set +# CONFIG_NETLINK_DIAG is not set +# CONFIG_MPLS is not set +# CONFIG_NET_NSH is not set +# CONFIG_HSR is not set +# CONFIG_NET_SWITCHDEV is not set +CONFIG_NET_L3_MASTER_DEV=y +# CONFIG_QRTR is not set +# CONFIG_NET_NCSI is not set +CONFIG_PCPU_DEV_REFCNT=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_SOCK_RX_QUEUE_MAPPING=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +# CONFIG_BPF_STREAM_PARSER is not set +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_NET_DROP_MONITOR is not set +# end of Network testing +# end of Networking options + +# CONFIG_HAMRADIO is not set +# CONFIG_CAN is not set +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +# CONFIG_BT_BNEP is not set +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +# CONFIG_BT_LEDS is not set +# CONFIG_BT_MSFTEXT is not set +# CONFIG_BT_AOSPEXT is not set +# CONFIG_BT_DEBUGFS is not set +# CONFIG_BT_SELFTEST is not set + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_MTK=m +CONFIG_BT_HCIBTUSB=m +# CONFIG_BT_HCIBTUSB_AUTOSUSPEND is not set +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +# CONFIG_BT_HCIBTSDIO is not set +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +# CONFIG_BT_HCIUART_NOKIA is not set +# CONFIG_BT_HCIUART_BCSP is 
not set +# CONFIG_BT_HCIUART_ATH3K is not set +# CONFIG_BT_HCIUART_LL is not set +CONFIG_BT_HCIUART_3WIRE=y +# CONFIG_BT_HCIUART_INTEL is not set +CONFIG_BT_HCIUART_BCM=y +# CONFIG_BT_HCIUART_RTL is not set +# CONFIG_BT_HCIUART_QCA is not set +# CONFIG_BT_HCIUART_AG6XX is not set +# CONFIG_BT_HCIUART_MRVL is not set +CONFIG_BT_HCIBCM203X=m +# CONFIG_BT_HCIBPA10X is not set +CONFIG_BT_HCIBFUSB=m +# CONFIG_BT_HCIVHCI is not set +# CONFIG_BT_MRVL is not set +CONFIG_BT_ATH3K=m +# CONFIG_BT_MTKSDIO is not set +# CONFIG_BT_MTKUART is not set +# end of Bluetooth device drivers + +# CONFIG_AF_RXRPC is not set +# CONFIG_AF_KCM is not set +# CONFIG_MCTP is not set +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +# CONFIG_CFG80211_CRDA_SUPPORT is not set +CONFIG_CFG80211_WEXT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +# CONFIG_MAC80211_MESH is not set +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +# CONFIG_RFKILL_GPIO is not set +# CONFIG_NET_9P is not set +# CONFIG_CAIF is not set +# CONFIG_CEPH_LIB is not set +# CONFIG_NFC is not set +# CONFIG_PSAMPLE is not set +# CONFIG_NET_IFE is not set +# CONFIG_LWTUNNEL is not set +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y 
+CONFIG_NET_SELFTESTS=y +CONFIG_NET_SOCK_MSG=y +CONFIG_PAGE_POOL=y +# CONFIG_PAGE_POOL_STATS is not set +# CONFIG_FAILOVER is not set +CONFIG_ETHTOOL_NETLINK=y + +# +# Device Drivers +# +CONFIG_ARM_AMBA=y +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCI_DOMAINS_GENERIC=y +CONFIG_PCI_SYSCALL=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +# CONFIG_PCIE_ECRC is not set +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEFAULT is not set +CONFIG_PCIEASPM_POWERSAVE=y +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +# CONFIG_PCIE_PTM is not set +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +# CONFIG_PCI_STUB is not set +# CONFIG_PCI_IOV is not set +# CONFIG_PCI_PRI is not set +# CONFIG_PCI_PASID is not set +CONFIG_PCI_LABEL=y +# CONFIG_PCIE_BUS_TUNE_OFF is not set +CONFIG_PCIE_BUS_DEFAULT=y +# CONFIG_PCIE_BUS_SAFE is not set +# CONFIG_PCIE_BUS_PERFORMANCE is not set +# CONFIG_PCIE_BUS_PEER2PEER is not set +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 +# CONFIG_HOTPLUG_PCI is not set + +# +# PCI controller drivers +# +# CONFIG_PCI_FTPCI100 is not set +# CONFIG_PCI_HOST_GENERIC is not set +# CONFIG_PCIE_XILINX is not set +# CONFIG_PCI_XGENE is not set +# CONFIG_PCIE_ALTERA is not set +# CONFIG_PCI_HOST_THUNDER_PEM is not set +# CONFIG_PCI_HOST_THUNDER_ECAM is not set +CONFIG_PCIE_BRCMSTB=y +# CONFIG_PCIE_MICROCHIP_HOST is not set + +# +# DesignWare PCI Core Support +# +# CONFIG_PCIE_DW_PLAT_HOST is not set +# CONFIG_PCI_HISI is not set +# CONFIG_PCIE_KIRIN is not set +# CONFIG_PCI_MESON is not set +# CONFIG_PCIE_AL is not set +# end of DesignWare PCI Core Support + +# +# Mobiveil PCIe Core Support +# +# end of Mobiveil PCIe Core Support + +# +# Cadence PCIe controllers support +# +# CONFIG_PCIE_CADENCE_PLAT_HOST is not set +# CONFIG_PCI_J721E_HOST is not set +# end of Cadence PCIe controllers support +# end 
of PCI controller drivers + +# +# PCI Endpoint +# +# CONFIG_PCI_ENDPOINT is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +# CONFIG_PCI_SW_SWITCHTEC is not set +# end of PCI switch controller drivers + +# CONFIG_CXL_BUS is not set +# CONFIG_PCCARD is not set +# CONFIG_RAPIDIO is not set + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_DEVTMPFS_SAFE is not set +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +# CONFIG_FW_LOADER_COMPRESS is not set +# CONFIG_FW_UPLOAD is not set +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +# CONFIG_ALLOW_DEV_COREDUMP is not set +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_SOC_BUS=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +CONFIG_GENERIC_ARCH_TOPOLOGY=y +# end of Generic Driver Options + +# +# Bus devices +# +# CONFIG_BRCMSTB_GISB_ARB is not set +# CONFIG_MOXTET is not set +# CONFIG_VEXPRESS_CONFIG is not set +# CONFIG_MHI_BUS is not set +# CONFIG_MHI_BUS_EP is not set +# end of Bus devices + +# CONFIG_CONNECTOR is not set + +# +# Firmware Drivers +# + +# +# ARM System Control and Management Interface Protocol +# +# CONFIG_ARM_SCMI_PROTOCOL is not set +# end of ARM System Control and Management Interface Protocol + +# CONFIG_ARM_SCPI_PROTOCOL is not set +# CONFIG_FIRMWARE_MEMMAP is not set +CONFIG_DMIID=y +# CONFIG_DMI_SYSFS is not set +CONFIG_RASPBERRYPI_FIRMWARE=y +# CONFIG_FW_CFG_SYSFS is not set +# CONFIG_SYSFB_SIMPLEFB is not set +# CONFIG_ARM_FFA_TRANSPORT is not set +CONFIG_CS_DSP=m +# 
CONFIG_GOOGLE_FIRMWARE is not set + +# +# EFI (Extensible Firmware Interface) Support +# +CONFIG_EFI_ESRT=y +# CONFIG_EFI_VARS_PSTORE is not set +CONFIG_EFI_PARAMS_FROM_FDT=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_GENERIC_STUB=y +# CONFIG_EFI_ZBOOT is not set +CONFIG_EFI_ARMSTUB_DTB_LOADER=y +CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y +# CONFIG_EFI_BOOTLOADER_CONTROL is not set +# CONFIG_EFI_CAPSULE_LOADER is not set +# CONFIG_EFI_TEST is not set +# CONFIG_RESET_ATTACK_MITIGATION is not set +# CONFIG_EFI_DISABLE_PCI_DMA is not set +CONFIG_EFI_EARLYCON=y +# CONFIG_EFI_DISABLE_RUNTIME is not set +# CONFIG_EFI_COCO_SECRET is not set +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_ARM_PSCI_FW=y +CONFIG_HAVE_ARM_SMCCC=y +CONFIG_HAVE_ARM_SMCCC_DISCOVERY=y +CONFIG_ARM_SMCCC_SOC_ID=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +# CONFIG_GNSS is not set +# CONFIG_MTD is not set +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_EARLY_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_RESERVED_MEM=y +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_OF_CONFIGFS=y +# CONFIG_PARPORT is not set +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_CDROM=y +# CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set +# CONFIG_ZRAM is not set +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=0 +# CONFIG_BLK_DEV_DRBD is not set +CONFIG_BLK_DEV_NBD=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=4096 +# CONFIG_CDROM_PKTCDVD is not set +# CONFIG_ATA_OVER_ETH is not set +# CONFIG_BLK_DEV_RBD is not set +# CONFIG_BLK_DEV_UBLK is not set + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +# CONFIG_NVME_MULTIPATH is not set +# CONFIG_NVME_VERBOSE_ERRORS is not set +# CONFIG_NVME_HWMON is not set +# CONFIG_NVME_FC is not set +# CONFIG_NVME_TCP is not set +# CONFIG_NVME_AUTH is not 
set +# CONFIG_NVME_TARGET is not set +# end of NVME Support + +# +# Misc devices +# +CONFIG_BCM2835_SMI=m +# CONFIG_AD525X_DPOT is not set +# CONFIG_DUMMY_IRQ is not set +# CONFIG_PHANTOM is not set +# CONFIG_TIFM_CORE is not set +# CONFIG_ICS932S401 is not set +# CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_HP_ILO is not set +# CONFIG_APDS9802ALS is not set +# CONFIG_ISL29003 is not set +# CONFIG_ISL29020 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_SENSORS_BH1770 is not set +# CONFIG_SENSORS_APDS990X is not set +# CONFIG_HMC6352 is not set +# CONFIG_DS1682 is not set +# CONFIG_LATTICE_ECP3_CONFIG is not set +# CONFIG_SRAM is not set +# CONFIG_DW_XDATA_PCIE is not set +# CONFIG_PCI_ENDPOINT_TEST is not set +# CONFIG_XILINX_SDFEC is not set +CONFIG_MISC_RTSX=y +# CONFIG_HISI_HIKEY_USB is not set +# CONFIG_OPEN_DICE is not set +# CONFIG_VCPU_STALL_DETECTOR is not set +# CONFIG_C2PORT is not set + +# +# EEPROM support +# +# CONFIG_EEPROM_AT24 is not set +# CONFIG_EEPROM_AT25 is not set +# CONFIG_EEPROM_LEGACY is not set +# CONFIG_EEPROM_MAX6875 is not set +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +# CONFIG_EEPROM_IDT_89HPESX is not set +# CONFIG_EEPROM_EE1004 is not set +# end of EEPROM support + +# CONFIG_CB710_CORE is not set + +# +# Texas Instruments shared transport line discipline +# +# CONFIG_TI_ST is not set +# end of Texas Instruments shared transport line discipline + +# CONFIG_SENSORS_LIS3_SPI is not set +# CONFIG_SENSORS_LIS3_I2C is not set +# CONFIG_ALTERA_STAPL is not set +# CONFIG_VMWARE_VMCI is not set +# CONFIG_GENWQE is not set +# CONFIG_ECHO is not set +# CONFIG_BCM_VK is not set +# CONFIG_MISC_ALCOR_PCI is not set +# CONFIG_MISC_RTSX_PCI is not set +CONFIG_MISC_RTSX_USB=y +# CONFIG_HABANA_AI is not set +# CONFIG_UACCE is not set +# CONFIG_PVPANIC is not set +# CONFIG_GP_PCI1XXXX is not set +# end of Misc devices + +# +# SCSI device support +# +CONFIG_SCSI_MOD=y +# CONFIG_RAID_ATTRS is not set +CONFIG_SCSI_COMMON=y 
+CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +# CONFIG_SCSI_PROC_FS is not set + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +# CONFIG_CHR_DEV_ST is not set +CONFIG_BLK_DEV_SR=y +CONFIG_CHR_DEV_SG=m +CONFIG_BLK_DEV_BSG=y +# CONFIG_CHR_DEV_SCH is not set +# CONFIG_SCSI_CONSTANTS is not set +# CONFIG_SCSI_LOGGING is not set +# CONFIG_SCSI_SCAN_ASYNC is not set + +# +# SCSI Transports +# +# CONFIG_SCSI_SPI_ATTRS is not set +# CONFIG_SCSI_FC_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_SAS_ATTRS is not set +# CONFIG_SCSI_SAS_LIBSAS is not set +# CONFIG_SCSI_SRP_ATTRS is not set +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=y +CONFIG_ISCSI_BOOT_SYSFS=y +# CONFIG_SCSI_CXGB3_ISCSI is not set +# CONFIG_SCSI_CXGB4_ISCSI is not set +# CONFIG_SCSI_BNX2_ISCSI is not set +# CONFIG_BE2ISCSI is not set +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_HPSA is not set +# CONFIG_SCSI_3W_9XXX is not set +# CONFIG_SCSI_3W_SAS is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC79XX is not set +# CONFIG_SCSI_AIC94XX is not set +# CONFIG_SCSI_MVSAS is not set +# CONFIG_SCSI_MVUMI is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_SCSI_ARCMSR is not set +# CONFIG_SCSI_ESAS2R is not set +# CONFIG_MEGARAID_NEWGEN is not set +# CONFIG_MEGARAID_LEGACY is not set +# CONFIG_MEGARAID_SAS is not set +# CONFIG_SCSI_MPT3SAS is not set +# CONFIG_SCSI_MPT2SAS is not set +# CONFIG_SCSI_MPI3MR is not set +# CONFIG_SCSI_SMARTPQI is not set +# CONFIG_SCSI_HPTIOP is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_MYRB is not set +# CONFIG_SCSI_MYRS is not set +# CONFIG_SCSI_SNIC is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_FDOMAIN_PCI is not set +# CONFIG_SCSI_IPS is not set +# CONFIG_SCSI_INITIO is not set +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_STEX is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_QLOGIC_1280 is 
not set +# CONFIG_SCSI_QLA_ISCSI is not set +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_AM53C974 is not set +# CONFIG_SCSI_WD719X is not set +# CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_PMCRAID is not set +# CONFIG_SCSI_PM8001 is not set +# CONFIG_SCSI_DH is not set +# end of SCSI device support + +# CONFIG_ATA is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +# CONFIG_BCACHE is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +# CONFIG_DM_UNSTRIPED is not set +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +CONFIG_DM_THIN_PROVISIONING=m +# CONFIG_DM_CACHE is not set +# CONFIG_DM_WRITECACHE is not set +# CONFIG_DM_EBS is not set +# CONFIG_DM_ERA is not set +# CONFIG_DM_CLONE is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_RAID is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set +# CONFIG_DM_DELAY is not set +# CONFIG_DM_DUST is not set +# CONFIG_DM_UEVENT is not set +# CONFIG_DM_FLAKEY is not set +# CONFIG_DM_VERITY is not set +# CONFIG_DM_SWITCH is not set +# CONFIG_DM_LOG_WRITES is not set +# CONFIG_DM_INTEGRITY is not set +# CONFIG_TARGET_CORE is not set +# CONFIG_FUSION is not set + +# +# IEEE 1394 (FireWire) support +# +# CONFIG_FIREWIRE is not set +# CONFIG_FIREWIRE_NOSY is not set +# end of IEEE 1394 (FireWire) support + +CONFIG_NETDEVICES=y +CONFIG_MII=y +CONFIG_NET_CORE=y +# CONFIG_BONDING is not set +CONFIG_DUMMY=m +CONFIG_WIREGUARD=m +# CONFIG_WIREGUARD_DEBUG is not set +# CONFIG_EQUALIZER is not set +# CONFIG_NET_FC is not set +# CONFIG_NET_TEAM is not set +CONFIG_MACVLAN=m +# CONFIG_MACVTAP is not set +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +# CONFIG_IPVTAP is not set +CONFIG_VXLAN=m +# CONFIG_GENEVE is not set +# CONFIG_BAREUDP is not set +# CONFIG_GTP is not set +# CONFIG_AMT is not set +# CONFIG_MACSEC is not set +CONFIG_NETCONSOLE=y 
+CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_TUN=y +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_NLMON=m +# CONFIG_ARCNET is not set +CONFIG_ETHERNET=y +CONFIG_NET_VENDOR_3COM=y +# CONFIG_VORTEX is not set +# CONFIG_TYPHOON is not set +CONFIG_NET_VENDOR_ADAPTEC=y +# CONFIG_ADAPTEC_STARFIRE is not set +CONFIG_NET_VENDOR_AGERE=y +# CONFIG_ET131X is not set +CONFIG_NET_VENDOR_ALACRITECH=y +# CONFIG_SLICOSS is not set +CONFIG_NET_VENDOR_ALTEON=y +# CONFIG_ACENIC is not set +# CONFIG_ALTERA_TSE is not set +CONFIG_NET_VENDOR_AMAZON=y +# CONFIG_ENA_ETHERNET is not set +CONFIG_NET_VENDOR_AMD=y +# CONFIG_AMD8111_ETH is not set +# CONFIG_PCNET32 is not set +# CONFIG_AMD_XGBE is not set +CONFIG_NET_VENDOR_AQUANTIA=y +# CONFIG_AQTION is not set +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ASIX=y +# CONFIG_SPI_AX88796C is not set +CONFIG_NET_VENDOR_ATHEROS=y +# CONFIG_ATL2 is not set +# CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set +# CONFIG_ALX is not set +CONFIG_NET_VENDOR_BROADCOM=y +# CONFIG_B44 is not set +CONFIG_BCMGENET=y +# CONFIG_BNX2 is not set +# CONFIG_CNIC is not set +# CONFIG_TIGON3 is not set +# CONFIG_BNX2X is not set +# CONFIG_SYSTEMPORT is not set +# CONFIG_BNXT is not set +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=y +# CONFIG_MACB_PCI is not set +CONFIG_NET_VENDOR_CAVIUM=y +# CONFIG_THUNDER_NIC_PF is not set +# CONFIG_THUNDER_NIC_VF is not set +# CONFIG_THUNDER_NIC_BGX is not set +# CONFIG_THUNDER_NIC_RGX is not set +# CONFIG_LIQUIDIO is not set +# CONFIG_LIQUIDIO_VF is not set +CONFIG_NET_VENDOR_CHELSIO=y +# CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set +# CONFIG_CHELSIO_T4 is not set +# CONFIG_CHELSIO_T4VF is not set +CONFIG_NET_VENDOR_CISCO=y +# CONFIG_ENIC is not set +CONFIG_NET_VENDOR_CORTINA=y +# CONFIG_GEMINI_ETHERNET is not set +CONFIG_NET_VENDOR_DAVICOM=y +# CONFIG_DM9051 is not set +# CONFIG_DNET is not set +CONFIG_NET_VENDOR_DEC=y +# CONFIG_NET_TULIP 
is not set +CONFIG_NET_VENDOR_DLINK=y +# CONFIG_DL2K is not set +# CONFIG_SUNDANCE is not set +CONFIG_NET_VENDOR_EMULEX=y +# CONFIG_BE2NET is not set +CONFIG_NET_VENDOR_ENGLEDER=y +# CONFIG_TSNEP is not set +CONFIG_NET_VENDOR_EZCHIP=y +# CONFIG_EZCHIP_NPS_MANAGEMENT_ENET is not set +CONFIG_NET_VENDOR_FUNGIBLE=y +# CONFIG_FUN_ETH is not set +CONFIG_NET_VENDOR_GOOGLE=y +# CONFIG_GVE is not set +CONFIG_NET_VENDOR_HISILICON=y +# CONFIG_HIX5HD2_GMAC is not set +# CONFIG_HISI_FEMAC is not set +# CONFIG_HIP04_ETH is not set +# CONFIG_HNS_DSAF is not set +# CONFIG_HNS_ENET is not set +# CONFIG_HNS3 is not set +CONFIG_NET_VENDOR_HUAWEI=y +# CONFIG_HINIC is not set +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +# CONFIG_E100 is not set +# CONFIG_E1000 is not set +# CONFIG_E1000E is not set +# CONFIG_IGB is not set +# CONFIG_IGBVF is not set +# CONFIG_IXGB is not set +# CONFIG_IXGBE is not set +# CONFIG_IXGBEVF is not set +# CONFIG_I40E is not set +# CONFIG_I40EVF is not set +# CONFIG_ICE is not set +# CONFIG_FM10K is not set +# CONFIG_IGC is not set +CONFIG_NET_VENDOR_WANGXUN=y +# CONFIG_NGBE is not set +# CONFIG_TXGBE is not set +# CONFIG_JME is not set +CONFIG_NET_VENDOR_ADI=y +CONFIG_NET_VENDOR_LITEX=y +# CONFIG_LITEX_LITEETH is not set +CONFIG_NET_VENDOR_MARVELL=y +# CONFIG_MVMDIO is not set +# CONFIG_SKGE is not set +# CONFIG_SKY2 is not set +# CONFIG_OCTEONTX2_AF is not set +# CONFIG_OCTEONTX2_PF is not set +# CONFIG_OCTEON_EP is not set +CONFIG_NET_VENDOR_MELLANOX=y +# CONFIG_MLX4_EN is not set +# CONFIG_MLX5_CORE is not set +# CONFIG_MLXSW_CORE is not set +# CONFIG_MLXFW is not set +CONFIG_NET_VENDOR_MICREL=y +# CONFIG_KS8842 is not set +# CONFIG_KS8851 is not set +# CONFIG_KS8851_MLL is not set +# CONFIG_KSZ884X_PCI is not set +CONFIG_NET_VENDOR_MICROCHIP=y +# CONFIG_ENC28J60 is not set +# CONFIG_ENCX24J600 is not set +# CONFIG_LAN743X is not set +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_NET_VENDOR_MICROSOFT=y +CONFIG_NET_VENDOR_MYRI=y +# CONFIG_MYRI10GE is 
not set +# CONFIG_FEALNX is not set +CONFIG_NET_VENDOR_NI=y +# CONFIG_NI_XGE_MANAGEMENT_ENET is not set +CONFIG_NET_VENDOR_NATSEMI=y +# CONFIG_NATSEMI is not set +# CONFIG_NS83820 is not set +CONFIG_NET_VENDOR_NETERION=y +# CONFIG_S2IO is not set +CONFIG_NET_VENDOR_NETRONOME=y +# CONFIG_NFP is not set +CONFIG_NET_VENDOR_8390=y +# CONFIG_NE2K_PCI is not set +CONFIG_NET_VENDOR_NVIDIA=y +# CONFIG_FORCEDETH is not set +CONFIG_NET_VENDOR_OKI=y +# CONFIG_ETHOC is not set +CONFIG_NET_VENDOR_PACKET_ENGINES=y +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_NET_VENDOR_PENSANDO is not set +CONFIG_NET_VENDOR_QLOGIC=y +# CONFIG_QLA3XXX is not set +# CONFIG_QLCNIC is not set +# CONFIG_NETXEN_NIC is not set +# CONFIG_QED is not set +CONFIG_NET_VENDOR_BROCADE=y +# CONFIG_BNA is not set +CONFIG_NET_VENDOR_QUALCOMM=y +# CONFIG_QCA7000_SPI is not set +# CONFIG_QCA7000_UART is not set +# CONFIG_QCOM_EMAC is not set +# CONFIG_RMNET is not set +CONFIG_NET_VENDOR_RDC=y +# CONFIG_R6040 is not set +CONFIG_NET_VENDOR_REALTEK=y +# CONFIG_8139CP is not set +# CONFIG_8139TOO is not set +# CONFIG_R8169 is not set +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_NET_VENDOR_SAMSUNG=y +# CONFIG_SXGBE_ETH is not set +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SILAN=y +# CONFIG_SC92031 is not set +CONFIG_NET_VENDOR_SIS=y +# CONFIG_SIS900 is not set +# CONFIG_SIS190 is not set +CONFIG_NET_VENDOR_SOLARFLARE=y +# CONFIG_SFC is not set +# CONFIG_SFC_FALCON is not set +CONFIG_NET_VENDOR_SMSC=y +# CONFIG_SMC91X is not set +# CONFIG_EPIC100 is not set +# CONFIG_SMSC911X is not set +# CONFIG_SMSC9420 is not set +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +# CONFIG_STMMAC_ETH is not set +CONFIG_NET_VENDOR_SUN=y +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_CASSINI is not set +# CONFIG_NIU is not set +CONFIG_NET_VENDOR_SYNOPSYS=y +# CONFIG_DWC_XLGMAC is not set +CONFIG_NET_VENDOR_TEHUTI=y +# CONFIG_TEHUTI is not set 
+CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +# CONFIG_TLAN is not set +CONFIG_NET_VENDOR_VERTEXCOM=y +# CONFIG_MSE102X is not set +CONFIG_NET_VENDOR_VIA=y +# CONFIG_VIA_RHINE is not set +# CONFIG_VIA_VELOCITY is not set +CONFIG_NET_VENDOR_WIZNET=y +# CONFIG_WIZNET_W5100 is not set +# CONFIG_WIZNET_W5300 is not set +CONFIG_NET_VENDOR_XILINX=y +# CONFIG_XILINX_EMACLITE is not set +# CONFIG_XILINX_AXI_EMAC is not set +# CONFIG_XILINX_LL_TEMAC is not set +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +CONFIG_PHYLINK=y +CONFIG_PHYLIB=y +CONFIG_SWPHY=y +# CONFIG_LED_TRIGGER_PHY is not set +CONFIG_FIXED_PHY=y +# CONFIG_SFP is not set + +# +# MII PHY device drivers +# +# CONFIG_AMD_PHY is not set +# CONFIG_ADIN_PHY is not set +# CONFIG_ADIN1100_PHY is not set +# CONFIG_AQUANTIA_PHY is not set +CONFIG_AX88796B_PHY=m +CONFIG_BROADCOM_PHY=y +# CONFIG_BCM54140_PHY is not set +CONFIG_BCM7XXX_PHY=y +# CONFIG_BCM84881_PHY is not set +# CONFIG_BCM87XX_PHY is not set +CONFIG_BCM_NET_PHYLIB=y +# CONFIG_CICADA_PHY is not set +# CONFIG_CORTINA_PHY is not set +# CONFIG_DAVICOM_PHY is not set +# CONFIG_ICPLUS_PHY is not set +# CONFIG_LXT_PHY is not set +# CONFIG_INTEL_XWAY_PHY is not set +# CONFIG_LSI_ET1011C_PHY is not set +# CONFIG_MARVELL_PHY is not set +# CONFIG_MARVELL_10G_PHY is not set +# CONFIG_MARVELL_88X2222_PHY is not set +# CONFIG_MAXLINEAR_GPHY is not set +# CONFIG_MEDIATEK_GE_PHY is not set +CONFIG_MICREL_PHY=y +CONFIG_MICROCHIP_PHY=y +# CONFIG_MICROCHIP_T1_PHY is not set +# CONFIG_MICROSEMI_PHY is not set +# CONFIG_MOTORCOMM_PHY is not set +# CONFIG_NATIONAL_PHY is not set +# CONFIG_NXP_C45_TJA11XX_PHY is not set +# CONFIG_NXP_TJA11XX_PHY is not set +# CONFIG_AT803X_PHY is not set +# CONFIG_QSEMI_PHY is not set +# CONFIG_REALTEK_PHY is not set +# CONFIG_RENESAS_PHY is not set +# CONFIG_ROCKCHIP_PHY is not set +CONFIG_SMSC_PHY=y +# CONFIG_STE10XP is not set +# CONFIG_TERANETICS_PHY is not set +# CONFIG_DP83822_PHY is not set +# CONFIG_DP83TC811_PHY 
is not set +# CONFIG_DP83848_PHY is not set +# CONFIG_DP83867_PHY is not set +# CONFIG_DP83869_PHY is not set +# CONFIG_DP83TD510_PHY is not set +# CONFIG_VITESSE_PHY is not set +# CONFIG_XILINX_GMII2RGMII is not set +# CONFIG_MICREL_KS8995MA is not set +# CONFIG_PSE_CONTROLLER is not set +CONFIG_MDIO_DEVICE=y +CONFIG_MDIO_BUS=y +CONFIG_FWNODE_MDIO=y +CONFIG_OF_MDIO=y +CONFIG_MDIO_DEVRES=y +# CONFIG_MDIO_BITBANG is not set +CONFIG_MDIO_BCM_UNIMAC=y +# CONFIG_MDIO_HISI_FEMAC is not set +# CONFIG_MDIO_MVUSB is not set +# CONFIG_MDIO_MSCC_MIIM is not set +# CONFIG_MDIO_OCTEON is not set +# CONFIG_MDIO_IPQ4019 is not set +# CONFIG_MDIO_IPQ8064 is not set +# CONFIG_MDIO_THUNDER is not set + +# +# MDIO Multiplexers +# +# CONFIG_MDIO_BUS_MUX_GPIO is not set +# CONFIG_MDIO_BUS_MUX_MULTIPLEXER is not set +# CONFIG_MDIO_BUS_MUX_MMIOREG is not set + +# +# PCS device drivers +# +# end of PCS device drivers + +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +# CONFIG_PPP_FILTER is not set +CONFIG_PPP_MPPE=m +# CONFIG_PPP_MULTILINK is not set +CONFIG_PPPOE=m +CONFIG_PPP_ASYNC=m +# CONFIG_PPP_SYNC_TTY is not set +# CONFIG_SLIP is not set +CONFIG_SLHC=m +CONFIG_USB_NET_DRIVERS=y +# CONFIG_USB_CATC is not set +# CONFIG_USB_KAWETH is not set +# CONFIG_USB_PEGASUS is not set +# CONFIG_USB_RTL8150 is not set +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=y +CONFIG_USB_USBNET=y +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +# CONFIG_USB_NET_CDC_EEM is not set +# CONFIG_USB_NET_CDC_NCM is not set +# CONFIG_USB_NET_HUAWEI_CDC_NCM is not set +# CONFIG_USB_NET_CDC_MBIM is not set +CONFIG_USB_NET_DM9601=y +# CONFIG_USB_NET_SR9700 is not set +# CONFIG_USB_NET_SR9800 is not set +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=y +# CONFIG_USB_NET_GL620A is not set +# CONFIG_USB_NET_NET1080 is not set +# CONFIG_USB_NET_PLUSB is not set +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +# CONFIG_USB_NET_CDC_SUBSET is not set +# 
CONFIG_USB_NET_ZAURUS is not set +# CONFIG_USB_NET_CX82310_ETH is not set +# CONFIG_USB_NET_KALMIA is not set +# CONFIG_USB_NET_QMI_WWAN is not set +CONFIG_USB_HSO=m +# CONFIG_USB_NET_INT51X1 is not set +CONFIG_USB_IPHETH=m +# CONFIG_USB_SIERRA_NET is not set +# CONFIG_USB_VL600 is not set +# CONFIG_USB_NET_CH9200 is not set +# CONFIG_USB_NET_AQC111 is not set +CONFIG_USB_RTL8153_ECM=m +CONFIG_WLAN=y +CONFIG_WLAN_VENDOR_ADMTEK=y +# CONFIG_ADM8211 is not set +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +# CONFIG_ATH5K is not set +# CONFIG_ATH5K_PCI is not set +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +# CONFIG_ATH9K_DYNACK is not set +# CONFIG_ATH9K_WOW is not set +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +# CONFIG_ATH9K_PCI_NO_EEPROM is not set +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +CONFIG_CARL9170_HWRNG=y +CONFIG_ATH6KL=m +# CONFIG_ATH6KL_SDIO is not set +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +# CONFIG_WIL6210 is not set +# CONFIG_ATH10K is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +# CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +# CONFIG_B43_SDIO is not set +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +# CONFIG_B43LEGACY is not set +CONFIG_BRCMUTIL=m +# 
CONFIG_BRCMSMAC is not set +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +# CONFIG_BRCMFMAC_PCIE is not set +# CONFIG_BRCM_TRACING is not set +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_WLAN_VENDOR_INTEL=y +# CONFIG_IPW2100 is not set +# CONFIG_IPW2200 is not set +# CONFIG_IWL4965 is not set +# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set +CONFIG_WLAN_VENDOR_INTERSIL=y +# CONFIG_HOSTAP is not set +# CONFIG_HERMES is not set +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +# CONFIG_P54_PCI is not set +# CONFIG_P54_SPI is not set +CONFIG_P54_LEDS=y +CONFIG_WLAN_VENDOR_MARVELL=y +# CONFIG_LIBERTAS is not set +# CONFIG_LIBERTAS_THINFIRM is not set +# CONFIG_MWIFIEX is not set +# CONFIG_MWL8K is not set +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76_CONNAC_LIB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +# CONFIG_MT76x0E is not set +CONFIG_MT76x2_COMMON=m +# CONFIG_MT76x2E is not set +CONFIG_MT76x2U=m +# CONFIG_MT7603E is not set +CONFIG_MT7615_COMMON=m +# CONFIG_MT7615E is not set +CONFIG_MT7663_USB_SDIO_COMMON=m +CONFIG_MT7663U=m +# CONFIG_MT7663S is not set +# CONFIG_MT7915E is not set +# CONFIG_MT7921E is not set +# CONFIG_MT7921S is not set +# CONFIG_MT7921U is not set +CONFIG_WLAN_VENDOR_MICROCHIP=y +# CONFIG_WILC1000_SDIO is not set +# CONFIG_WILC1000_SPI is not set +CONFIG_WLAN_VENDOR_PURELIFI=y +# CONFIG_PLFXLC is not set +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +# CONFIG_RT2400PCI is not set +# CONFIG_RT2500PCI is not set +# CONFIG_RT61PCI is not set +# CONFIG_RT2800PCI is not set +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m 
+CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +# CONFIG_RTL8180 is not set +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +# CONFIG_RTL8192CE is not set +# CONFIG_RTL8192SE is not set +# CONFIG_RTL8192DE is not set +# CONFIG_RTL8723AE is not set +# CONFIG_RTL8723BE is not set +# CONFIG_RTL8188EE is not set +# CONFIG_RTL8192EE is not set +# CONFIG_RTL8821AE is not set +# CONFIG_RTL8192CU is not set +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +# CONFIG_RTW88_8822BE is not set +# CONFIG_RTW88_8822CE is not set +# CONFIG_RTW88_8723DE is not set +# CONFIG_RTW88_8821CE is not set +# CONFIG_RTW89 is not set +CONFIG_WLAN_VENDOR_RSI=y +# CONFIG_RSI_91X is not set +CONFIG_WLAN_VENDOR_SILABS=y +# CONFIG_WFX is not set +CONFIG_WLAN_VENDOR_ST=y +# CONFIG_CW1200 is not set +CONFIG_WLAN_VENDOR_TI=y +# CONFIG_WL1251 is not set +# CONFIG_WL12XX is not set +# CONFIG_WL18XX is not set +# CONFIG_WLCORE is not set +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +# CONFIG_WLAN_VENDOR_QUANTENNA is not set +# CONFIG_MAC80211_HWSIM is not set +CONFIG_USB_NET_RNDIS_WLAN=m +# CONFIG_VIRT_WIFI is not set +# CONFIG_WAN is not set + +# +# Wireless WAN +# +# CONFIG_WWAN is not set +# end of Wireless WAN + +# CONFIG_VMXNET3 is not set +# CONFIG_NETDEVSIM is not set +# CONFIG_NET_FAILOVER is not set +# CONFIG_ISDN is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=y +CONFIG_INPUT_FF_MEMLESS=y +# CONFIG_INPUT_SPARSEKMAP is not set +# CONFIG_INPUT_MATRIXKMAP is not set + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=y +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +# 
CONFIG_KEYBOARD_ADP5588 is not set +# CONFIG_KEYBOARD_ADP5589 is not set +# CONFIG_KEYBOARD_ATKBD is not set +# CONFIG_KEYBOARD_QT1050 is not set +# CONFIG_KEYBOARD_QT1070 is not set +# CONFIG_KEYBOARD_QT2160 is not set +# CONFIG_KEYBOARD_DLINK_DIR685 is not set +# CONFIG_KEYBOARD_LKKBD is not set +CONFIG_KEYBOARD_GPIO=m +# CONFIG_KEYBOARD_GPIO_POLLED is not set +# CONFIG_KEYBOARD_TCA6416 is not set +# CONFIG_KEYBOARD_TCA8418 is not set +# CONFIG_KEYBOARD_MATRIX is not set +# CONFIG_KEYBOARD_LM8323 is not set +# CONFIG_KEYBOARD_LM8333 is not set +# CONFIG_KEYBOARD_MAX7359 is not set +# CONFIG_KEYBOARD_MCS is not set +# CONFIG_KEYBOARD_MPR121 is not set +# CONFIG_KEYBOARD_NEWTON is not set +# CONFIG_KEYBOARD_OPENCORES is not set +# CONFIG_KEYBOARD_PINEPHONE is not set +# CONFIG_KEYBOARD_SAMSUNG is not set +# CONFIG_KEYBOARD_STOWAWAY is not set +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_OMAP4 is not set +# CONFIG_KEYBOARD_TM2_TOUCHKEY is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_CAP11XX is not set +# CONFIG_KEYBOARD_BCM is not set +# CONFIG_KEYBOARD_CYPRESS_SF is not set +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +# CONFIG_JOYSTICK_ANALOG is not set +# CONFIG_JOYSTICK_A3D is not set +# CONFIG_JOYSTICK_ADI is not set +# CONFIG_JOYSTICK_COBRA is not set +# CONFIG_JOYSTICK_GF2K is not set +# CONFIG_JOYSTICK_GRIP is not set +# CONFIG_JOYSTICK_GRIP_MP is not set +# CONFIG_JOYSTICK_GUILLEMOT is not set +# CONFIG_JOYSTICK_INTERACT is not set +# CONFIG_JOYSTICK_SIDEWINDER is not set +# CONFIG_JOYSTICK_TMDC is not set +# CONFIG_JOYSTICK_IFORCE is not set +# CONFIG_JOYSTICK_WARRIOR is not set +# CONFIG_JOYSTICK_MAGELLAN is not set +# CONFIG_JOYSTICK_SPACEORB is not set +# CONFIG_JOYSTICK_SPACEBALL is not set +# CONFIG_JOYSTICK_STINGER is not set +# CONFIG_JOYSTICK_TWIDJOY is not set +# CONFIG_JOYSTICK_ZHENHUA is not set +# CONFIG_JOYSTICK_AS5011 is not set +# CONFIG_JOYSTICK_JOYDUMP is not set +CONFIG_JOYSTICK_XPAD=m 
+CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +# CONFIG_JOYSTICK_PXRC is not set +# CONFIG_JOYSTICK_QWIIC is not set +# CONFIG_JOYSTICK_FSIA6B is not set +# CONFIG_JOYSTICK_SENSEHAT is not set +# CONFIG_JOYSTICK_RPISENSE is not set +# CONFIG_INPUT_TABLET is not set +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_ADS7846=m +# CONFIG_TOUCHSCREEN_AD7877 is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set +# CONFIG_TOUCHSCREEN_AR1021_I2C is not set +# CONFIG_TOUCHSCREEN_ATMEL_MXT is not set +# CONFIG_TOUCHSCREEN_AUO_PIXCIR is not set +# CONFIG_TOUCHSCREEN_BU21013 is not set +# CONFIG_TOUCHSCREEN_BU21029 is not set +# CONFIG_TOUCHSCREEN_CHIPONE_ICN8318 is not set +# CONFIG_TOUCHSCREEN_CY8CTMA140 is not set +# CONFIG_TOUCHSCREEN_CY8CTMG110 is not set +# CONFIG_TOUCHSCREEN_CYTTSP_CORE is not set +# CONFIG_TOUCHSCREEN_CYTTSP4_CORE is not set +# CONFIG_TOUCHSCREEN_DYNAPRO is not set +# CONFIG_TOUCHSCREEN_HAMPSHIRE is not set +# CONFIG_TOUCHSCREEN_EETI is not set +CONFIG_TOUCHSCREEN_EGALAX=m +# CONFIG_TOUCHSCREEN_EGALAX_SERIAL is not set +# CONFIG_TOUCHSCREEN_EXC3000 is not set +# CONFIG_TOUCHSCREEN_FUJITSU is not set +# CONFIG_TOUCHSCREEN_GOODIX is not set +# CONFIG_TOUCHSCREEN_HIDEEP is not set +# CONFIG_TOUCHSCREEN_HYCON_HY46XX is not set +# CONFIG_TOUCHSCREEN_ILI210X is not set +# CONFIG_TOUCHSCREEN_ILITEK is not set +# CONFIG_TOUCHSCREEN_S6SY761 is not set +# CONFIG_TOUCHSCREEN_GUNZE is not set +# CONFIG_TOUCHSCREEN_EKTF2127 is not set +# CONFIG_TOUCHSCREEN_ELAN is not set +# CONFIG_TOUCHSCREEN_ELO is not set +# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set +# CONFIG_TOUCHSCREEN_WACOM_I2C is not set +# CONFIG_TOUCHSCREEN_MAX11801 is not set +# CONFIG_TOUCHSCREEN_MCS5000 is not set +# CONFIG_TOUCHSCREEN_MMS114 is not set +# CONFIG_TOUCHSCREEN_MELFAS_MIP4 is not set +# CONFIG_TOUCHSCREEN_MSG2638 is not set +# CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_IMAGIS is not set +# 
CONFIG_TOUCHSCREEN_IMX6UL_TSC is not set +# CONFIG_TOUCHSCREEN_INEXIO is not set +# CONFIG_TOUCHSCREEN_MK712 is not set +# CONFIG_TOUCHSCREEN_PENMOUNT is not set +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_RASPBERRYPI_FW=m +# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set +# CONFIG_TOUCHSCREEN_TOUCHWIN is not set +# CONFIG_TOUCHSCREEN_PIXCIR is not set +# CONFIG_TOUCHSCREEN_WDT87XX_I2C is not set +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +# CONFIG_TOUCHSCREEN_USB_PANJIT is not set +CONFIG_TOUCHSCREEN_USB_3M=y +# CONFIG_TOUCHSCREEN_USB_ITM is not set +# CONFIG_TOUCHSCREEN_USB_ETURBO is not set +# CONFIG_TOUCHSCREEN_USB_GUNZE is not set +# CONFIG_TOUCHSCREEN_USB_DMC_TSC10 is not set +# CONFIG_TOUCHSCREEN_USB_IRTOUCH is not set +# CONFIG_TOUCHSCREEN_USB_IDEALTEK is not set +# CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH is not set +# CONFIG_TOUCHSCREEN_USB_GOTOP is not set +# CONFIG_TOUCHSCREEN_USB_JASTEC is not set +# CONFIG_TOUCHSCREEN_USB_ELO is not set +# CONFIG_TOUCHSCREEN_USB_E2I is not set +# CONFIG_TOUCHSCREEN_USB_ZYTRONIC is not set +# CONFIG_TOUCHSCREEN_USB_ETT_TC45USB is not set +# CONFIG_TOUCHSCREEN_USB_NEXIO is not set +# CONFIG_TOUCHSCREEN_USB_EASYTOUCH is not set +# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set +# CONFIG_TOUCHSCREEN_TSC_SERIO is not set +# CONFIG_TOUCHSCREEN_TSC2004 is not set +# CONFIG_TOUCHSCREEN_TSC2005 is not set +# CONFIG_TOUCHSCREEN_TSC2007 is not set +# CONFIG_TOUCHSCREEN_RM_TS is not set +# CONFIG_TOUCHSCREEN_SILEAD is not set +# CONFIG_TOUCHSCREEN_SIS_I2C is not set +CONFIG_TOUCHSCREEN_ST1232=m +# CONFIG_TOUCHSCREEN_STMFTS is not set +# CONFIG_TOUCHSCREEN_SUR40 is not set +# CONFIG_TOUCHSCREEN_SURFACE3_SPI is not set +# CONFIG_TOUCHSCREEN_SX8654 is not set +# CONFIG_TOUCHSCREEN_TPS6507X is not set +# CONFIG_TOUCHSCREEN_ZET6223 is not set +# CONFIG_TOUCHSCREEN_ZFORCE is not set +# CONFIG_TOUCHSCREEN_ROHM_BU21023 is not set +# CONFIG_TOUCHSCREEN_IQS5XX is not set +# CONFIG_TOUCHSCREEN_ZINITIX is not set 
+CONFIG_INPUT_MISC=y +# CONFIG_INPUT_AD714X is not set +# CONFIG_INPUT_ARIZONA_HAPTICS is not set +# CONFIG_INPUT_ATMEL_CAPTOUCH is not set +# CONFIG_INPUT_BMA150 is not set +# CONFIG_INPUT_E3X0_BUTTON is not set +# CONFIG_INPUT_MMA8450 is not set +# CONFIG_INPUT_GPIO_BEEPER is not set +# CONFIG_INPUT_GPIO_DECODER is not set +# CONFIG_INPUT_GPIO_VIBRA is not set +# CONFIG_INPUT_ATI_REMOTE2 is not set +# CONFIG_INPUT_KEYSPAN_REMOTE is not set +# CONFIG_INPUT_KXTJ9 is not set +# CONFIG_INPUT_POWERMATE is not set +# CONFIG_INPUT_YEALINK is not set +# CONFIG_INPUT_CM109 is not set +# CONFIG_INPUT_REGULATOR_HAPTIC is not set +CONFIG_INPUT_UINPUT=y +# CONFIG_INPUT_PCF8574 is not set +# CONFIG_INPUT_PWM_BEEPER is not set +# CONFIG_INPUT_PWM_VIBRA is not set +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +# CONFIG_INPUT_DA7280_HAPTICS is not set +# CONFIG_INPUT_ADXL34X is not set +# CONFIG_INPUT_IMS_PCU is not set +# CONFIG_INPUT_IQS269A is not set +# CONFIG_INPUT_IQS626A is not set +# CONFIG_INPUT_IQS7222 is not set +# CONFIG_INPUT_CMA3000 is not set +# CONFIG_INPUT_DRV260X_HAPTICS is not set +# CONFIG_INPUT_DRV2665_HAPTICS is not set +# CONFIG_INPUT_DRV2667_HAPTICS is not set +CONFIG_INPUT_RASPBERRYPI_BUTTON=y +CONFIG_RMI4_CORE=y +# CONFIG_RMI4_I2C is not set +# CONFIG_RMI4_SPI is not set +# CONFIG_RMI4_SMB is not set +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=y +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +# CONFIG_RMI4_F34 is not set +# CONFIG_RMI4_F3A is not set +# CONFIG_RMI4_F55 is not set + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_AMBAKMI is not set +# CONFIG_SERIO_PCIPS2 is not set +# CONFIG_SERIO_LIBPS2 is not set +# CONFIG_SERIO_RAW is not set +# CONFIG_SERIO_ALTERA_PS2 is not set +# CONFIG_SERIO_PS2MULT is not set +# CONFIG_SERIO_ARC_PS2 is not set +# CONFIG_SERIO_APBPS2 is not set +# CONFIG_SERIO_GPIO_PS2 is not set +# CONFIG_USERIO is not set +# CONFIG_GAMEPORT is not set +# end of Hardware 
I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_BRCM_CHAR_DRIVERS=y +# CONFIG_BCM2708_VCMEM is not set +CONFIG_BCM_VCIO=y +CONFIG_BCM2835_SMI_DEV=m +# CONFIG_RPIVID_MEM is not set +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_LDISC_AUTOLOAD=y + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_16550A_VARIANTS=y +# CONFIG_SERIAL_8250_FINTEK is not set +CONFIG_SERIAL_8250_CONSOLE=y +# CONFIG_SERIAL_8250_DMA is not set +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=y +CONFIG_SERIAL_8250_NR_UARTS=1 +CONFIG_SERIAL_8250_RUNTIME_UARTS=0 +CONFIG_SERIAL_8250_EXTENDED=y +# CONFIG_SERIAL_8250_MANY_PORTS is not set +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +# CONFIG_SERIAL_8250_RSA is not set +CONFIG_SERIAL_8250_BCM2835AUX=y +CONFIG_SERIAL_8250_FSL=y +# CONFIG_SERIAL_8250_DW is not set +# CONFIG_SERIAL_8250_RT288X is not set +CONFIG_SERIAL_8250_PERICOM=y +CONFIG_SERIAL_8250_BCM7271=y +CONFIG_SERIAL_OF_PLATFORM=y + +# +# Non-8250 serial port support +# +# CONFIG_SERIAL_AMBA_PL010 is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +# CONFIG_SERIAL_EARLYCON_ARM_SEMIHOST is not set +# CONFIG_SERIAL_MAX3100 is not set +# CONFIG_SERIAL_MAX310X is not set +# CONFIG_SERIAL_UARTLITE is not set +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_JSM is not set +# CONFIG_SERIAL_SIFIVE is not set +# CONFIG_SERIAL_SCCNXP is not set +# CONFIG_SERIAL_SC16IS7XX is not set +# CONFIG_SERIAL_ALTERA_JTAGUART is not set +# CONFIG_SERIAL_ALTERA_UART is not set +# CONFIG_SERIAL_XILINX_PS_UART is not set +# CONFIG_SERIAL_ARC is not set +# CONFIG_SERIAL_RP2 is not set +# CONFIG_SERIAL_FSL_LPUART is not set +# CONFIG_SERIAL_FSL_LINFLEXUART is not set +# 
CONFIG_SERIAL_CONEXANT_DIGICOLOR is not set +# CONFIG_SERIAL_SPRD is not set +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +# CONFIG_SERIAL_NONSTANDARD is not set +# CONFIG_N_GSM is not set +# CONFIG_NOZOMI is not set +# CONFIG_NULL_TTY is not set +# CONFIG_HVC_DCC is not set +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +# CONFIG_VIRTIO_CONSOLE is not set +# CONFIG_IPMI_HANDLER is not set +CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set +# CONFIG_HW_RANDOM_BA431 is not set +CONFIG_HW_RANDOM_BCM2835=y +CONFIG_HW_RANDOM_IPROC_RNG200=y +# CONFIG_HW_RANDOM_CCTRNG is not set +# CONFIG_HW_RANDOM_XIPHERA is not set +CONFIG_HW_RANDOM_ARM_SMCCC_TRNG=y +CONFIG_HW_RANDOM_CN10K=y +# CONFIG_APPLICOM is not set +CONFIG_DEVMEM=y +CONFIG_DEVPORT=y +# CONFIG_TCG_TPM is not set +# CONFIG_XILLYBUS is not set +# CONFIG_XILLYUSB is not set +CONFIG_RANDOM_TRUST_CPU=y +CONFIG_RANDOM_TRUST_BOOTLOADER=y +CONFIG_RASPBERRYPI_GPIOMEM=y +# end of Character devices + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_MUX=y + +# +# Multiplexer I2C Chip support +# +# CONFIG_I2C_ARB_GPIO_CHALLENGE is not set +# CONFIG_I2C_MUX_GPIO is not set +# CONFIG_I2C_MUX_GPMUX is not set +# CONFIG_I2C_MUX_LTC4306 is not set +# CONFIG_I2C_MUX_PCA9541 is not set +# CONFIG_I2C_MUX_PCA954x is not set +CONFIG_I2C_MUX_PINCTRL=y +# CONFIG_I2C_MUX_REG is not set +# CONFIG_I2C_DEMUX_PINCTRL is not set +# CONFIG_I2C_MUX_MLXCPLD is not set +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_ALGOBIT=y + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_BCM2708=y +CONFIG_I2C_BCM2708_BAUDRATE=100000 +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_ISCH is not 
set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_NVIDIA_GPU is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_BCM2835=y +CONFIG_I2C_BRCMSTB=y +# CONFIG_I2C_CADENCE is not set +# CONFIG_I2C_CBUS_GPIO is not set +CONFIG_I2C_DESIGNWARE_CORE=y +# CONFIG_I2C_DESIGNWARE_SLAVE is not set +CONFIG_I2C_DESIGNWARE_PLATFORM=y +# CONFIG_I2C_DESIGNWARE_PCI is not set +# CONFIG_I2C_EMEV2 is not set +CONFIG_I2C_GPIO=y +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +# CONFIG_I2C_NOMADIK is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_RK3X is not set +# CONFIG_I2C_SIMTEC is not set +# CONFIG_I2C_THUNDERX is not set +# CONFIG_I2C_XILINX is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_DIOLAN_U2C is not set +# CONFIG_I2C_CP2615 is not set +# CONFIG_I2C_PCI1XXXX is not set +# CONFIG_I2C_ROBOTFUZZ_OSIF is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_VIRTIO is not set +# end of I2C Hardware Bus support + +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_SLAVE is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +# CONFIG_I3C is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +# CONFIG_SPI_MEM is not set + +# +# SPI Master Controller Drivers +# +# CONFIG_SPI_ALTERA is not set +# CONFIG_SPI_AXI_SPI_ENGINE is not set +CONFIG_SPI_BCM2835=m +CONFIG_SPI_BCM2835AUX=m +# CONFIG_SPI_BCM_QSPI is not set +# CONFIG_SPI_BITBANG is not set +# CONFIG_SPI_CADENCE is not set +# CONFIG_SPI_CADENCE_QUADSPI is not set +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_DMA=y +# CONFIG_SPI_DW_PCI is not set +CONFIG_SPI_DW_MMIO=m +# 
CONFIG_SPI_NXP_FLEXSPI is not set +# CONFIG_SPI_GPIO is not set +# CONFIG_SPI_FSL_SPI is not set +# CONFIG_SPI_MICROCHIP_CORE is not set +# CONFIG_SPI_MICROCHIP_CORE_QSPI is not set +# CONFIG_SPI_OC_TINY is not set +# CONFIG_SPI_PL022 is not set +# CONFIG_SPI_PXA2XX is not set +# CONFIG_SPI_ROCKCHIP is not set +# CONFIG_SPI_SC18IS602 is not set +# CONFIG_SPI_SIFIVE is not set +# CONFIG_SPI_MXIC is not set +# CONFIG_SPI_THUNDERX is not set +# CONFIG_SPI_XCOMM is not set +# CONFIG_SPI_XILINX is not set +# CONFIG_SPI_ZYNQMP_GQSPI is not set +# CONFIG_SPI_AMD is not set + +# +# SPI Multiplexer support +# +# CONFIG_SPI_MUX is not set + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=y +# CONFIG_SPI_LOOPBACK_TEST is not set +# CONFIG_SPI_TLE62X0 is not set +# CONFIG_SPI_SLAVE is not set +CONFIG_SPI_DYNAMIC=y +# CONFIG_SPMI is not set +# CONFIG_HSI is not set +# CONFIG_PPS is not set + +# +# PTP clock support +# +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_PTP_1588_CLOCK_OPTIONAL=y + +# +# Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. 
+# +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_PINMUX=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +# CONFIG_PINCTRL_CY8C95X0 is not set +# CONFIG_PINCTRL_MCP23S08 is not set +# CONFIG_PINCTRL_MICROCHIP_SGPIO is not set +# CONFIG_PINCTRL_OCELOT is not set +# CONFIG_PINCTRL_SINGLE is not set +# CONFIG_PINCTRL_STMFX is not set +# CONFIG_PINCTRL_SX150X is not set +CONFIG_PINCTRL_RP1=y +CONFIG_PINCTRL_BCM2712=y +CONFIG_PINCTRL_BCM2835=y + +# +# Renesas pinctrl drivers +# +# end of Renesas pinctrl drivers + +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_CDEV=y +CONFIG_GPIO_CDEV_V1=y +CONFIG_GPIO_GENERIC=y + +# +# Memory mapped GPIO drivers +# +# CONFIG_GPIO_74XX_MMIO is not set +# CONFIG_GPIO_ALTERA is not set +CONFIG_GPIO_RASPBERRYPI_EXP=y +CONFIG_GPIO_BCM_VIRT=y +CONFIG_GPIO_BRCMSTB=y +# CONFIG_GPIO_CADENCE is not set +# CONFIG_GPIO_DWAPB is not set +# CONFIG_GPIO_EXAR is not set +# CONFIG_GPIO_FTGPIO010 is not set +# CONFIG_GPIO_GENERIC_PLATFORM is not set +# CONFIG_GPIO_GRGPIO is not set +# CONFIG_GPIO_HLWD is not set +# CONFIG_GPIO_LOGICVC is not set +# CONFIG_GPIO_MB86S7X is not set +# CONFIG_GPIO_PL061 is not set +# CONFIG_GPIO_PWM is not set +# CONFIG_GPIO_SIFIVE is not set +# CONFIG_GPIO_SYSCON is not set +# CONFIG_GPIO_XGENE is not set +# CONFIG_GPIO_XILINX is not set +# CONFIG_GPIO_AMD_FCH is not set +# end of Memory mapped GPIO drivers + +# +# I2C GPIO expanders +# +# CONFIG_GPIO_ADNP is not set +# CONFIG_GPIO_GW_PLD is not set +# CONFIG_GPIO_MAX7300 is not set +# CONFIG_GPIO_MAX732X is not set +# CONFIG_GPIO_PCA953X is not set +# CONFIG_GPIO_PCA9570 is not set +# CONFIG_GPIO_PCF857X is not set +# CONFIG_GPIO_TPIC2810 is not set +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_FSM=y +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +# 
CONFIG_GPIO_BT8XX is not set +# CONFIG_GPIO_PCI_IDIO_16 is not set +# CONFIG_GPIO_PCIE_IDIO_24 is not set +# CONFIG_GPIO_RDC321X is not set +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +# CONFIG_GPIO_74X164 is not set +# CONFIG_GPIO_MAX3191X is not set +# CONFIG_GPIO_MAX7301 is not set +# CONFIG_GPIO_MC33880 is not set +# CONFIG_GPIO_PISOSR is not set +# CONFIG_GPIO_XRA1403 is not set +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +# end of USB GPIO expanders + +# +# Virtual GPIO drivers +# +# CONFIG_GPIO_AGGREGATOR is not set +# CONFIG_GPIO_MOCKUP is not set +# CONFIG_GPIO_SIM is not set +# end of Virtual GPIO drivers + +CONFIG_W1=m + +# +# 1-wire Bus Masters +# +# CONFIG_W1_MASTER_MATROX is not set +# CONFIG_W1_MASTER_DS2490 is not set +# CONFIG_W1_MASTER_DS2482 is not set +# CONFIG_W1_MASTER_DS1WM is not set +CONFIG_W1_MASTER_GPIO=m +# CONFIG_W1_MASTER_SGI is not set +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +# CONFIG_W1_SLAVE_SMEM is not set +# CONFIG_W1_SLAVE_DS2405 is not set +# CONFIG_W1_SLAVE_DS2408 is not set +# CONFIG_W1_SLAVE_DS2413 is not set +# CONFIG_W1_SLAVE_DS2406 is not set +# CONFIG_W1_SLAVE_DS2423 is not set +# CONFIG_W1_SLAVE_DS2805 is not set +# CONFIG_W1_SLAVE_DS2430 is not set +# CONFIG_W1_SLAVE_DS2431 is not set +# CONFIG_W1_SLAVE_DS2433 is not set +# CONFIG_W1_SLAVE_DS2438 is not set +# CONFIG_W1_SLAVE_DS250X is not set +# CONFIG_W1_SLAVE_DS2780 is not set +# CONFIG_W1_SLAVE_DS2781 is not set +# CONFIG_W1_SLAVE_DS28E04 is not set +# CONFIG_W1_SLAVE_DS28E17 is not set +# end of 1-wire Slaves + +CONFIG_POWER_RESET=y +# CONFIG_POWER_RESET_BRCMSTB is not set +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +# CONFIG_POWER_RESET_LTC2952 is not set +# CONFIG_POWER_RESET_REGULATOR is not set +CONFIG_POWER_RESET_RESTART=y +# CONFIG_POWER_RESET_XGENE is not set +# CONFIG_POWER_RESET_SYSCON is not set +# CONFIG_POWER_RESET_SYSCON_POWEROFF is not set +# 
CONFIG_SYSCON_REBOOT_MODE is not set +# CONFIG_NVMEM_REBOOT_MODE is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_RPI_POE_POWER=m +# CONFIG_PDA_POWER is not set +# CONFIG_IP5XXX_POWER is not set +# CONFIG_TEST_POWER is not set +# CONFIG_CHARGER_ADP5061 is not set +# CONFIG_BATTERY_CW2015 is not set +# CONFIG_BATTERY_DS2760 is not set +# CONFIG_BATTERY_DS2780 is not set +# CONFIG_BATTERY_DS2781 is not set +# CONFIG_BATTERY_DS2782 is not set +# CONFIG_BATTERY_SAMSUNG_SDI is not set +# CONFIG_BATTERY_SBS is not set +# CONFIG_CHARGER_SBS is not set +# CONFIG_MANAGER_SBS is not set +# CONFIG_BATTERY_BQ27XXX is not set +# CONFIG_BATTERY_MAX17040 is not set +# CONFIG_BATTERY_MAX17042 is not set +# CONFIG_BATTERY_MAX1721X is not set +# CONFIG_CHARGER_ISP1704 is not set +# CONFIG_CHARGER_MAX8903 is not set +# CONFIG_CHARGER_LP8727 is not set +# CONFIG_CHARGER_GPIO is not set +# CONFIG_CHARGER_MANAGER is not set +# CONFIG_CHARGER_LT3651 is not set +# CONFIG_CHARGER_LTC4162L is not set +# CONFIG_CHARGER_DETECTOR_MAX14656 is not set +# CONFIG_CHARGER_MAX77976 is not set +# CONFIG_CHARGER_BQ2415X is not set +# CONFIG_CHARGER_BQ24190 is not set +# CONFIG_CHARGER_BQ24257 is not set +# CONFIG_CHARGER_BQ24735 is not set +# CONFIG_CHARGER_BQ2515X is not set +# CONFIG_CHARGER_BQ25890 is not set +# CONFIG_CHARGER_BQ25980 is not set +# CONFIG_CHARGER_BQ256XX is not set +# CONFIG_CHARGER_SMB347 is not set +# CONFIG_BATTERY_GAUGE_LTC2941 is not set +# CONFIG_BATTERY_GOLDFISH is not set +# CONFIG_BATTERY_RT5033 is not set +# CONFIG_CHARGER_RT9455 is not set +# CONFIG_CHARGER_UCS1002 is not set +# CONFIG_CHARGER_BD99954 is not set +# CONFIG_BATTERY_UG3105 is not set +CONFIG_HWMON=y +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +# CONFIG_SENSORS_AD7314 is not set +# CONFIG_SENSORS_AD7414 is not set +# CONFIG_SENSORS_AD7418 is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# 
CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1029 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM1177 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ADT7310 is not set +# CONFIG_SENSORS_ADT7410 is not set +# CONFIG_SENSORS_ADT7411 is not set +# CONFIG_SENSORS_ADT7462 is not set +# CONFIG_SENSORS_ADT7470 is not set +# CONFIG_SENSORS_ADT7475 is not set +# CONFIG_SENSORS_AHT10 is not set +# CONFIG_SENSORS_AQUACOMPUTER_D5NEXT is not set +# CONFIG_SENSORS_AS370 is not set +# CONFIG_SENSORS_ASC7621 is not set +# CONFIG_SENSORS_AXI_FAN_CONTROL is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_CORSAIR_CPRO is not set +# CONFIG_SENSORS_CORSAIR_PSU is not set +# CONFIG_SENSORS_DS620 is not set +# CONFIG_SENSORS_DS1621 is not set +# CONFIG_SENSORS_I5K_AMB is not set +# CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_F71882FG is not set +# CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_FTSTEUTATES is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_G760A is not set +# CONFIG_SENSORS_G762 is not set +CONFIG_SENSORS_GPIO_FAN=m +# CONFIG_SENSORS_HIH6130 is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_JC42 is not set +# CONFIG_SENSORS_POWR1220 is not set +# CONFIG_SENSORS_LINEAGE is not set +# CONFIG_SENSORS_LTC2945 is not set +# CONFIG_SENSORS_LTC2947_I2C is not set +# CONFIG_SENSORS_LTC2947_SPI is not set +# CONFIG_SENSORS_LTC2990 is not set +# CONFIG_SENSORS_LTC2992 is not set +# CONFIG_SENSORS_LTC4151 is not set +# CONFIG_SENSORS_LTC4215 is not set +# CONFIG_SENSORS_LTC4222 is not set +# CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LTC4260 is not set +# CONFIG_SENSORS_LTC4261 is not set +# CONFIG_SENSORS_MAX1111 is not set +# CONFIG_SENSORS_MAX127 is not set +# CONFIG_SENSORS_MAX16065 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_MAX1668 is not set +# CONFIG_SENSORS_MAX197 is not set +# CONFIG_SENSORS_MAX31722 is 
not set +# CONFIG_SENSORS_MAX31730 is not set +# CONFIG_SENSORS_MAX31760 is not set +# CONFIG_SENSORS_MAX6620 is not set +# CONFIG_SENSORS_MAX6621 is not set +# CONFIG_SENSORS_MAX6639 is not set +# CONFIG_SENSORS_MAX6642 is not set +# CONFIG_SENSORS_MAX6650 is not set +# CONFIG_SENSORS_MAX6697 is not set +# CONFIG_SENSORS_MAX31790 is not set +# CONFIG_SENSORS_MCP3021 is not set +# CONFIG_SENSORS_TC654 is not set +# CONFIG_SENSORS_TPS23861 is not set +# CONFIG_SENSORS_MR75203 is not set +# CONFIG_SENSORS_ADCXX is not set +# CONFIG_SENSORS_LM63 is not set +# CONFIG_SENSORS_LM70 is not set +# CONFIG_SENSORS_LM73 is not set +# CONFIG_SENSORS_LM75 is not set +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LM95234 is not set +# CONFIG_SENSORS_LM95241 is not set +# CONFIG_SENSORS_LM95245 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_NCT6683 is not set +# CONFIG_SENSORS_NCT6775 is not set +# CONFIG_SENSORS_NCT6775_I2C is not set +# CONFIG_SENSORS_NCT7802 is not set +# CONFIG_SENSORS_NCT7904 is not set +# CONFIG_SENSORS_NPCM7XX is not set +# CONFIG_SENSORS_NZXT_KRAKEN2 is not set +# CONFIG_SENSORS_NZXT_SMART2 is not set +# CONFIG_SENSORS_OCC_P8_I2C is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_PMBUS is not set +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_RASPBERRYPI_HWMON=y +# CONFIG_SENSORS_SBTSI is not set +# CONFIG_SENSORS_SBRMI is not set +# CONFIG_SENSORS_SHT15 is not set +# CONFIG_SENSORS_SHT21 is not set +# CONFIG_SENSORS_SHT3x is not set +# CONFIG_SENSORS_SHT4x is not set +# CONFIG_SENSORS_SHTC1 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_DME1737 is not set +# CONFIG_SENSORS_EMC1403 is not set +# 
CONFIG_SENSORS_EMC2103 is not set +# CONFIG_SENSORS_EMC2305 is not set +# CONFIG_SENSORS_EMC6W201 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +# CONFIG_SENSORS_SMSC47B397 is not set +# CONFIG_SENSORS_SCH5627 is not set +# CONFIG_SENSORS_SCH5636 is not set +# CONFIG_SENSORS_STTS751 is not set +# CONFIG_SENSORS_SMM665 is not set +# CONFIG_SENSORS_ADC128D818 is not set +# CONFIG_SENSORS_ADS7828 is not set +# CONFIG_SENSORS_ADS7871 is not set +# CONFIG_SENSORS_AMC6821 is not set +# CONFIG_SENSORS_INA209 is not set +# CONFIG_SENSORS_INA2XX is not set +# CONFIG_SENSORS_INA238 is not set +# CONFIG_SENSORS_INA3221 is not set +# CONFIG_SENSORS_TC74 is not set +# CONFIG_SENSORS_THMC50 is not set +# CONFIG_SENSORS_TMP102 is not set +# CONFIG_SENSORS_TMP103 is not set +# CONFIG_SENSORS_TMP108 is not set +# CONFIG_SENSORS_TMP401 is not set +# CONFIG_SENSORS_TMP421 is not set +# CONFIG_SENSORS_TMP464 is not set +# CONFIG_SENSORS_TMP513 is not set +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT1211 is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83773G is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83793 is not set +# CONFIG_SENSORS_W83795 is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83L786NG is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set +CONFIG_SENSORS_RP1_ADC=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_NETLINK is not set +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +# CONFIG_THERMAL_WRITABLE_TRIPS is not set +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_GOV_FAIR_SHARE is not set +CONFIG_THERMAL_GOV_STEP_WISE=y +# CONFIG_THERMAL_GOV_BANG_BANG is not set +# 
CONFIG_THERMAL_GOV_USER_SPACE is not set +# CONFIG_CPU_THERMAL is not set +# CONFIG_THERMAL_EMULATION is not set +# CONFIG_THERMAL_MMIO is not set + +# +# Broadcom thermal drivers +# +CONFIG_BCM2711_THERMAL=y +CONFIG_BCM2835_THERMAL=y +# CONFIG_BRCMSTB_THERMAL is not set +# end of Broadcom thermal drivers + +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +# CONFIG_WATCHDOG_SYSFS is not set +# CONFIG_WATCHDOG_HRTIMER_PRETIMEOUT is not set + +# +# Watchdog Pretimeout Governors +# +# CONFIG_WATCHDOG_PRETIMEOUT_GOV is not set + +# +# Watchdog Device Drivers +# +# CONFIG_SOFT_WATCHDOG is not set +# CONFIG_GPIO_WATCHDOG is not set +# CONFIG_XILINX_WATCHDOG is not set +# CONFIG_ZIIRAVE_WATCHDOG is not set +# CONFIG_ARM_SP805_WATCHDOG is not set +# CONFIG_ARM_SBSA_WATCHDOG is not set +# CONFIG_CADENCE_WATCHDOG is not set +# CONFIG_DW_WATCHDOG is not set +# CONFIG_MAX63XX_WATCHDOG is not set +# CONFIG_ARM_SMC_WATCHDOG is not set +# CONFIG_ALIM7101_WDT is not set +# CONFIG_I6300ESB_WDT is not set +# CONFIG_HP_WATCHDOG is not set +CONFIG_BCM2835_WDT=y +# CONFIG_BCM7038_WDT is not set +# CONFIG_MEN_A21_WDT is not set + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +# CONFIG_SSB_SDIOHOST is not set +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +# CONFIG_SSB_DRIVER_GPIO is not set +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +# CONFIG_BCMA_DRIVER_GPIO is not set +# CONFIG_BCMA_DEBUG 
is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +# CONFIG_MFD_RPISENSE_CORE is not set +# CONFIG_MFD_ACT8945A is not set +# CONFIG_MFD_AS3711 is not set +# CONFIG_MFD_AS3722 is not set +# CONFIG_PMIC_ADP5520 is not set +# CONFIG_MFD_AAT2870_CORE is not set +# CONFIG_MFD_ATMEL_FLEXCOM is not set +# CONFIG_MFD_ATMEL_HLCDC is not set +# CONFIG_MFD_BCM590XX is not set +# CONFIG_MFD_BD9571MWV is not set +# CONFIG_MFD_AXP20X_I2C is not set +# CONFIG_MFD_MADERA is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_DA9052_SPI is not set +# CONFIG_MFD_DA9052_I2C is not set +# CONFIG_MFD_DA9055 is not set +# CONFIG_MFD_DA9062 is not set +# CONFIG_MFD_DA9063 is not set +# CONFIG_MFD_DA9150 is not set +# CONFIG_MFD_DLN2 is not set +# CONFIG_MFD_GATEWORKS_GSC is not set +# CONFIG_MFD_MC13XXX_SPI is not set +# CONFIG_MFD_MC13XXX_I2C is not set +# CONFIG_MFD_MP2629 is not set +# CONFIG_MFD_HI6421_PMIC is not set +# CONFIG_HTC_PASIC3 is not set +# CONFIG_HTC_I2CPLD is not set +# CONFIG_LPC_ICH is not set +# CONFIG_LPC_SCH is not set +# CONFIG_MFD_IQS62X is not set +# CONFIG_MFD_JANZ_CMODIO is not set +# CONFIG_MFD_KEMPLD is not set +# CONFIG_MFD_88PM800 is not set +# CONFIG_MFD_88PM805 is not set +# CONFIG_MFD_88PM860X is not set +# CONFIG_MFD_MAX14577 is not set +# CONFIG_MFD_MAX77620 is not set +# CONFIG_MFD_MAX77650 is not set +# CONFIG_MFD_MAX77686 is not set +# CONFIG_MFD_MAX77693 is not set +# CONFIG_MFD_MAX77714 is not set +# CONFIG_MFD_MAX77843 is not set +# CONFIG_MFD_MAX8907 is not set +# CONFIG_MFD_MAX8925 is not set +# CONFIG_MFD_MAX8997 is not set +# CONFIG_MFD_MAX8998 is not set +# CONFIG_MFD_MT6360 is not set +# CONFIG_MFD_MT6370 is not set +# CONFIG_MFD_MT6397 is not set +# CONFIG_MFD_MENF21BMC is not set +# CONFIG_MFD_OCELOT is not set +# CONFIG_EZX_PCAP is not set +# CONFIG_MFD_CPCAP is not set +# CONFIG_MFD_VIPERBOARD is not set +# CONFIG_MFD_NTXEC is not set +# CONFIG_MFD_RETU is not set +# CONFIG_MFD_PCF50633 is not set +# 
CONFIG_MFD_SY7636A is not set +CONFIG_MFD_RASPBERRYPI_POE_HAT=m +# CONFIG_MFD_RDC321X is not set +# CONFIG_MFD_RT4831 is not set +# CONFIG_MFD_RT5033 is not set +# CONFIG_MFD_RT5120 is not set +# CONFIG_MFD_RC5T583 is not set +# CONFIG_MFD_RK808 is not set +# CONFIG_MFD_RN5T618 is not set +# CONFIG_MFD_SEC_CORE is not set +# CONFIG_MFD_SI476X_CORE is not set +CONFIG_MFD_SIMPLE_MFD_I2C=m +# CONFIG_MFD_SM501 is not set +# CONFIG_MFD_SKY81452 is not set +# CONFIG_MFD_STMPE is not set +CONFIG_MFD_SYSCON=y +# CONFIG_MFD_TI_AM335X_TSCADC is not set +# CONFIG_MFD_LP3943 is not set +# CONFIG_MFD_LP8788 is not set +# CONFIG_MFD_TI_LMU is not set +# CONFIG_MFD_PALMAS is not set +# CONFIG_TPS6105X is not set +# CONFIG_TPS65010 is not set +# CONFIG_TPS6507X is not set +# CONFIG_MFD_TPS65086 is not set +# CONFIG_MFD_TPS65090 is not set +# CONFIG_MFD_TPS65217 is not set +# CONFIG_MFD_TI_LP873X is not set +# CONFIG_MFD_TI_LP87565 is not set +# CONFIG_MFD_TPS65218 is not set +# CONFIG_MFD_TPS6586X is not set +# CONFIG_MFD_TPS65910 is not set +# CONFIG_MFD_TPS65912_I2C is not set +# CONFIG_MFD_TPS65912_SPI is not set +# CONFIG_TWL4030_CORE is not set +# CONFIG_TWL6040_CORE is not set +# CONFIG_MFD_WL1273_CORE is not set +# CONFIG_MFD_LM3533 is not set +# CONFIG_MFD_TC3589X is not set +# CONFIG_MFD_TQMX86 is not set +# CONFIG_MFD_VX855 is not set +# CONFIG_MFD_LOCHNAGAR is not set +CONFIG_MFD_ARIZONA=m +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +# CONFIG_MFD_CS47L24 is not set +CONFIG_MFD_WM5102=y +# CONFIG_MFD_WM5110 is not set +# CONFIG_MFD_WM8997 is not set +# CONFIG_MFD_WM8998 is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM831X_I2C is not set +# CONFIG_MFD_WM831X_SPI is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_WM8994 is not set +# CONFIG_MFD_ROHM_BD718XX is not set +# CONFIG_MFD_ROHM_BD71828 is not set +# CONFIG_MFD_ROHM_BD957XMUF is not set +# CONFIG_MFD_STPMIC1 is not set +# CONFIG_MFD_STMFX is not set +# CONFIG_MFD_ATC260X_I2C is not set 
+# CONFIG_MFD_QCOM_PM8008 is not set +# CONFIG_RAVE_SP_CORE is not set +# CONFIG_MFD_INTEL_M10_BMC is not set +CONFIG_MFD_RP1=y +# CONFIG_MFD_RSMU_I2C is not set +# CONFIG_MFD_RSMU_SPI is not set +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=y +# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set +# CONFIG_REGULATOR_USERSPACE_CONSUMER is not set +# CONFIG_REGULATOR_88PG86X is not set +# CONFIG_REGULATOR_ACT8865 is not set +# CONFIG_REGULATOR_AD5398 is not set +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +# CONFIG_REGULATOR_DA9121 is not set +# CONFIG_REGULATOR_DA9210 is not set +# CONFIG_REGULATOR_DA9211 is not set +# CONFIG_REGULATOR_FAN53555 is not set +# CONFIG_REGULATOR_FAN53880 is not set +CONFIG_REGULATOR_GPIO=y +# CONFIG_REGULATOR_ISL9305 is not set +# CONFIG_REGULATOR_ISL6271A is not set +# CONFIG_REGULATOR_LP3971 is not set +# CONFIG_REGULATOR_LP3972 is not set +# CONFIG_REGULATOR_LP872X is not set +# CONFIG_REGULATOR_LP8755 is not set +# CONFIG_REGULATOR_LTC3589 is not set +# CONFIG_REGULATOR_LTC3676 is not set +# CONFIG_REGULATOR_MAX1586 is not set +# CONFIG_REGULATOR_MAX8649 is not set +# CONFIG_REGULATOR_MAX8660 is not set +# CONFIG_REGULATOR_MAX8893 is not set +# CONFIG_REGULATOR_MAX8952 is not set +# CONFIG_REGULATOR_MAX8973 is not set +# CONFIG_REGULATOR_MAX20086 is not set +# CONFIG_REGULATOR_MAX77826 is not set +# CONFIG_REGULATOR_MCP16502 is not set +# CONFIG_REGULATOR_MP5416 is not set +# CONFIG_REGULATOR_MP8859 is not set +# CONFIG_REGULATOR_MP886X is not set +# CONFIG_REGULATOR_MPQ7920 is not set +# CONFIG_REGULATOR_MT6311 is not set +# CONFIG_REGULATOR_PCA9450 is not set +# CONFIG_REGULATOR_PF8X00 is not set +# CONFIG_REGULATOR_PFUZE100 is not set +# CONFIG_REGULATOR_PV88060 is not set +# CONFIG_REGULATOR_PV88080 is not set +# CONFIG_REGULATOR_PV88090 is not set +# CONFIG_REGULATOR_PWM is not set 
+CONFIG_REGULATOR_RASPBERRYPI_TOUCHSCREEN_ATTINY=y +# CONFIG_REGULATOR_RT4801 is not set +# CONFIG_REGULATOR_RT5190A is not set +# CONFIG_REGULATOR_RT5759 is not set +# CONFIG_REGULATOR_RT6160 is not set +# CONFIG_REGULATOR_RT6245 is not set +# CONFIG_REGULATOR_RTQ2134 is not set +# CONFIG_REGULATOR_RTMV20 is not set +# CONFIG_REGULATOR_RTQ6752 is not set +# CONFIG_REGULATOR_SLG51000 is not set +# CONFIG_REGULATOR_SY8106A is not set +# CONFIG_REGULATOR_SY8824X is not set +# CONFIG_REGULATOR_SY8827N is not set +# CONFIG_REGULATOR_TPS51632 is not set +# CONFIG_REGULATOR_TPS62360 is not set +# CONFIG_REGULATOR_TPS6286X is not set +# CONFIG_REGULATOR_TPS65023 is not set +# CONFIG_REGULATOR_TPS6507X is not set +# CONFIG_REGULATOR_TPS65132 is not set +# CONFIG_REGULATOR_TPS6524X is not set +# CONFIG_REGULATOR_VCTRL is not set +CONFIG_RC_CORE=y +CONFIG_BPF_LIRC_MODE2=y +CONFIG_LIRC=y +CONFIG_RC_MAP=m +CONFIG_RC_DECODERS=y +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +# CONFIG_IR_HIX5HD2 is not set +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_REDRAT3=m +# CONFIG_IR_SERIAL is not set +# CONFIG_IR_SPI is not set +CONFIG_IR_STREAMZAP=m +CONFIG_IR_TOY=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_ATI_REMOTE=m +# CONFIG_RC_LOOPBACK is not set +CONFIG_RC_XBOX_DVD=m +CONFIG_CEC_CORE=y + +# +# CEC support +# +# CONFIG_MEDIA_CEC_RC is not set +# CONFIG_MEDIA_CEC_SUPPORT is not set +# end of CEC support + +CONFIG_MEDIA_SUPPORT=m +# CONFIG_MEDIA_SUPPORT_FILTER is not set +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y + +# +# Media device types +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y 
+CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_PLATFORM_SUPPORT=y +CONFIG_MEDIA_TEST_SUPPORT=y +# end of Media device types + +# +# Media core support +# +CONFIG_VIDEO_DEV=m +CONFIG_MEDIA_CONTROLLER=y +CONFIG_DVB_CORE=m +# end of Media core support + +# +# Video4Linux options +# +CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +# CONFIG_V4L2_FLASH_LED_CLASS is not set +CONFIG_V4L2_FWNODE=m +CONFIG_V4L2_ASYNC=m +# end of Video4Linux options + +# +# Media controller options +# +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_MEDIA_CONTROLLER_REQUEST_API=y +# end of Media controller options + +# +# Digital TV options +# +# CONFIG_DVB_MMAP is not set +CONFIG_DVB_NET=y +CONFIG_DVB_MAX_ADAPTERS=8 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set +# end of Digital TV options + +# +# Media drivers +# + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +# CONFIG_USB_GSPCA is not set +# CONFIG_USB_PWC is not set +# CONFIG_USB_S2255 is not set +CONFIG_VIDEO_USBTV=m +CONFIG_USB_VIDEO_CLASS=m +# CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV is not set + +# +# Analog TV USB devices +# +# CONFIG_VIDEO_GO7007 is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +# CONFIG_VIDEO_CX231XX_ALSA is not set +CONFIG_VIDEO_CX231XX_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_AS102=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_USB_V2=m 
+CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_CXUSB=m +# CONFIG_DVB_USB_CXUSB_ANALOG is not set +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_VP7045=m +CONFIG_SMS_USB_DRV=m +# CONFIG_DVB_TTUSB_BUDGET is not set +# CONFIG_DVB_TTUSB_DEC is not set + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +# CONFIG_VIDEO_EM28XX_V4L2 is not set +# CONFIG_VIDEO_EM28XX_ALSA is not set +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +# CONFIG_USB_AIRSPY is not set +# CONFIG_USB_HACKRF is not set +# CONFIG_USB_MSI2500 is not set +# CONFIG_MEDIA_PCI_SUPPORT is not set +# CONFIG_RADIO_ADAPTERS is not set +CONFIG_MEDIA_PLATFORM_DRIVERS=y +# CONFIG_V4L_PLATFORM_DRIVERS is not set +# CONFIG_SDR_PLATFORM_DRIVERS is not set +# CONFIG_DVB_PLATFORM_DRIVERS is not set +# CONFIG_V4L_MEM2MEM_DRIVERS is not set + +# +# Allegro DVT media platform drivers +# + +# +# Amlogic media platform drivers +# + +# +# Amphion drivers +# + +# +# Aspeed media platform drivers +# + +# +# Atmel media platform 
drivers +# +# CONFIG_VIDEO_BCM2835_UNICAM is not set + +# +# Cadence media platform drivers +# +# CONFIG_VIDEO_CADENCE_CSI2RX is not set +# CONFIG_VIDEO_CADENCE_CSI2TX is not set + +# +# Chips&Media media platform drivers +# + +# +# Intel media platform drivers +# + +# +# Marvell media platform drivers +# + +# +# Mediatek media platform drivers +# + +# +# NVidia media platform drivers +# + +# +# NXP media platform drivers +# + +# +# Qualcomm media platform drivers +# + +# +# Raspberry Pi media platform drivers +# +CONFIG_VIDEO_RASPBERRYPI_PISP_BE=m +CONFIG_VIDEO_RP1_CFE=m + +# +# Renesas media platform drivers +# + +# +# Rockchip media platform drivers +# + +# +# Samsung media platform drivers +# + +# +# STMicroelectronics media platform drivers +# + +# +# Sunxi media platform drivers +# + +# +# Texas Instruments drivers +# + +# +# Verisilicon media platform drivers +# + +# +# VIA media platform drivers +# + +# +# Xilinx media platform drivers +# + +# +# MMC/SDIO DVB adapters +# +CONFIG_SMS_SDIO_DRV=m +# CONFIG_V4L_TEST_DRIVERS is not set +# CONFIG_DVB_TEST_DRIVERS is not set +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_TTPCI_EEPROM=m +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +# end of Media drivers + +# +# Media ancillary drivers +# +CONFIG_MEDIA_ATTACH=y + +# +# IR I2C driver auto-selected by 'Autoselect ancillary drivers' +# +CONFIG_VIDEO_IR_I2C=m +CONFIG_VIDEO_CAMERA_SENSOR=y +# CONFIG_VIDEO_AR0521 is not set +# CONFIG_VIDEO_ARDUCAM_64MP is not set +# CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set +# CONFIG_VIDEO_HI556 is not set +# CONFIG_VIDEO_HI846 is not set +# CONFIG_VIDEO_HI847 is not set +# CONFIG_VIDEO_IMX208 is not set +# CONFIG_VIDEO_IMX214 is not set 
+# CONFIG_VIDEO_IMX219 is not set +# CONFIG_VIDEO_IMX258 is not set +# CONFIG_VIDEO_IMX274 is not set +# CONFIG_VIDEO_IMX290 is not set +# CONFIG_VIDEO_IMX296 is not set +# CONFIG_VIDEO_IMX319 is not set +# CONFIG_VIDEO_IMX334 is not set +# CONFIG_VIDEO_IMX335 is not set +# CONFIG_VIDEO_IMX355 is not set +# CONFIG_VIDEO_IMX412 is not set +# CONFIG_VIDEO_IMX477 is not set +# CONFIG_VIDEO_IMX519 is not set +# CONFIG_VIDEO_IMX708 is not set +# CONFIG_VIDEO_MT9M001 is not set +# CONFIG_VIDEO_MT9M032 is not set +# CONFIG_VIDEO_MT9M111 is not set +# CONFIG_VIDEO_MT9P031 is not set +# CONFIG_VIDEO_MT9T001 is not set +# CONFIG_VIDEO_MT9T112 is not set +# CONFIG_VIDEO_MT9V011 is not set +# CONFIG_VIDEO_MT9V032 is not set +# CONFIG_VIDEO_MT9V111 is not set +# CONFIG_VIDEO_NOON010PC30 is not set +# CONFIG_VIDEO_OG01A1B is not set +# CONFIG_VIDEO_OV02A10 is not set +# CONFIG_VIDEO_OV08D10 is not set +# CONFIG_VIDEO_OV13858 is not set +# CONFIG_VIDEO_OV13B10 is not set +# CONFIG_VIDEO_OV2311 is not set +# CONFIG_VIDEO_OV2640 is not set +# CONFIG_VIDEO_OV2659 is not set +# CONFIG_VIDEO_OV2680 is not set +# CONFIG_VIDEO_OV2685 is not set +# CONFIG_VIDEO_OV5640 is not set +# CONFIG_VIDEO_OV5645 is not set +# CONFIG_VIDEO_OV5647 is not set +# CONFIG_VIDEO_OV5648 is not set +# CONFIG_VIDEO_OV5670 is not set +# CONFIG_VIDEO_OV5675 is not set +# CONFIG_VIDEO_OV5693 is not set +# CONFIG_VIDEO_OV5695 is not set +# CONFIG_VIDEO_OV6650 is not set +# CONFIG_VIDEO_OV7251 is not set +# CONFIG_VIDEO_OV7640 is not set +# CONFIG_VIDEO_OV7670 is not set +# CONFIG_VIDEO_OV772X is not set +# CONFIG_VIDEO_OV7740 is not set +# CONFIG_VIDEO_OV8856 is not set +# CONFIG_VIDEO_OV8865 is not set +# CONFIG_VIDEO_OV9282 is not set +# CONFIG_VIDEO_OV9640 is not set +# CONFIG_VIDEO_OV9650 is not set +# CONFIG_VIDEO_RDACM20 is not set +# CONFIG_VIDEO_RDACM21 is not set +# CONFIG_VIDEO_RJ54N1 is not set +# CONFIG_VIDEO_S5C73M3 is not set +# CONFIG_VIDEO_S5K4ECGX is not set +# CONFIG_VIDEO_S5K5BAF is not set +# 
CONFIG_VIDEO_S5K6A3 is not set +# CONFIG_VIDEO_S5K6AA is not set +# CONFIG_VIDEO_SR030PC30 is not set +# CONFIG_VIDEO_VS6624 is not set +# CONFIG_VIDEO_CCS is not set +# CONFIG_VIDEO_ET8EK8 is not set +# CONFIG_VIDEO_M5MOLS is not set + +# +# Lens drivers +# +# CONFIG_VIDEO_AD5398 is not set +# CONFIG_VIDEO_AD5820 is not set +# CONFIG_VIDEO_AK7375 is not set +# CONFIG_VIDEO_DW9714 is not set +# CONFIG_VIDEO_DW9768 is not set +# CONFIG_VIDEO_DW9807_VCM is not set +# end of Lens drivers + +# +# Flash devices +# +# CONFIG_VIDEO_ADP1653 is not set +# CONFIG_VIDEO_LM3560 is not set +# CONFIG_VIDEO_LM3646 is not set +# end of Flash devices + +# +# Audio decoders, processors and mixers +# +# CONFIG_VIDEO_CS3308 is not set +# CONFIG_VIDEO_CS5345 is not set +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_MSP3400=m +# CONFIG_VIDEO_SONY_BTF_MPX is not set +# CONFIG_VIDEO_TDA1997X is not set +# CONFIG_VIDEO_TDA7432 is not set +# CONFIG_VIDEO_TDA9840 is not set +# CONFIG_VIDEO_TEA6415C is not set +# CONFIG_VIDEO_TEA6420 is not set +# CONFIG_VIDEO_TLV320AIC23B is not set +# CONFIG_VIDEO_TVAUDIO is not set +# CONFIG_VIDEO_UDA1342 is not set +# CONFIG_VIDEO_VP27SMPX is not set +# CONFIG_VIDEO_WM8739 is not set +CONFIG_VIDEO_WM8775=m +# end of Audio decoders, processors and mixers + +# +# RDS decoders +# +# CONFIG_VIDEO_SAA6588 is not set +# end of RDS decoders + +# +# Video decoders +# +# CONFIG_VIDEO_ADV7180 is not set +# CONFIG_VIDEO_ADV7183 is not set +# CONFIG_VIDEO_ADV748X is not set +# CONFIG_VIDEO_ADV7604 is not set +# CONFIG_VIDEO_ADV7842 is not set +# CONFIG_VIDEO_BT819 is not set +# CONFIG_VIDEO_BT856 is not set +# CONFIG_VIDEO_BT866 is not set +# CONFIG_VIDEO_ISL7998X is not set +# CONFIG_VIDEO_KS0127 is not set +# CONFIG_VIDEO_MAX9286 is not set +# CONFIG_VIDEO_ML86V7667 is not set +# CONFIG_VIDEO_SAA7110 is not set +CONFIG_VIDEO_SAA711X=m +# CONFIG_VIDEO_TC358743 is not set +# CONFIG_VIDEO_TVP514X is not set +# CONFIG_VIDEO_TVP5150 is not set +# CONFIG_VIDEO_TVP7002 is not set 
+# CONFIG_VIDEO_TW2804 is not set +# CONFIG_VIDEO_OV9281 is not set +# CONFIG_VIDEO_TW9903 is not set +# CONFIG_VIDEO_TW9906 is not set +# CONFIG_VIDEO_TW9910 is not set +# CONFIG_VIDEO_IRS1125 is not set +# CONFIG_VIDEO_VPX3220 is not set + +# +# Video and audio decoders +# +# CONFIG_VIDEO_SAA717X is not set +CONFIG_VIDEO_CX25840=m +# end of Video decoders + +# +# Video encoders +# +# CONFIG_VIDEO_AD9389B is not set +# CONFIG_VIDEO_ADV7170 is not set +# CONFIG_VIDEO_ADV7175 is not set +# CONFIG_VIDEO_ADV7343 is not set +# CONFIG_VIDEO_ADV7393 is not set +# CONFIG_VIDEO_ADV7511 is not set +# CONFIG_VIDEO_AK881X is not set +# CONFIG_VIDEO_SAA7127 is not set +# CONFIG_VIDEO_SAA7185 is not set +# CONFIG_VIDEO_THS8200 is not set +# end of Video encoders + +# +# Video improvement chips +# +# CONFIG_VIDEO_UPD64031A is not set +# CONFIG_VIDEO_UPD64083 is not set +# end of Video improvement chips + +# +# Audio/Video compression chips +# +# CONFIG_VIDEO_SAA6752HS is not set +# end of Audio/Video compression chips + +# +# SDR tuner chips +# +# CONFIG_SDR_MAX2175 is not set +# end of SDR tuner chips + +# +# Miscellaneous helper chips +# +# CONFIG_VIDEO_I2C is not set +# CONFIG_VIDEO_M52790 is not set +# CONFIG_VIDEO_ST_MIPID02 is not set +# CONFIG_VIDEO_THS7303 is not set +# end of Miscellaneous helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# CONFIG_VIDEO_GS1662 is not set +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_IT913X=m +# CONFIG_MEDIA_TUNER_M88RS6000T is not set +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_MC44S803=m +# CONFIG_MEDIA_TUNER_MSI001 is not set +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT20XX=m +# CONFIG_MEDIA_TUNER_MT2131 is not set +CONFIG_MEDIA_TUNER_MT2266=m +# CONFIG_MEDIA_TUNER_MXL301RF 
is not set +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +# CONFIG_MEDIA_TUNER_QM1D1B0004 is not set +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_XC5000=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_M88DS3103=m +# CONFIG_DVB_MXL5XX is not set +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +# CONFIG_DVB_STV0910 is not set +CONFIG_DVB_STV6110x=m +# CONFIG_DVB_STV6111 is not set + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_TDA18271C2DD=m + +# +# DVB-S (satellite) frontends +# +# CONFIG_DVB_CX24110 is not set +CONFIG_DVB_CX24116=m +# CONFIG_DVB_CX24117 is not set +CONFIG_DVB_CX24120=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_DS3000=m +# CONFIG_DVB_MB86A16 is not set +CONFIG_DVB_MT312=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_TDA10071=m +CONFIG_DVB_TDA10086=m +# CONFIG_DVB_TDA8083 is not set +# CONFIG_DVB_TDA8261 is not set +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TS2020=m +# CONFIG_DVB_TUA6100 is not set +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TUNER_ITD1000=m +# CONFIG_DVB_VES1X93 is not set +# CONFIG_DVB_ZL10036 is not set +CONFIG_DVB_ZL10039=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_AF9013=m +CONFIG_DVB_AS102_FE=m +# CONFIG_DVB_CX22700 is not set 
+CONFIG_DVB_CX22702=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +# CONFIG_DVB_DIB9000 is not set +CONFIG_DVB_DRXD=m +CONFIG_DVB_EC100=m +CONFIG_DVB_GP8PSK_FE=m +# CONFIG_DVB_L64781 is not set +CONFIG_DVB_MT352=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +# CONFIG_DVB_S5H1432 is not set +CONFIG_DVB_SI2168=m +# CONFIG_DVB_SP887X is not set +# CONFIG_DVB_STV0367 is not set +CONFIG_DVB_TDA10048=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_STV0297=m +# CONFIG_DVB_TDA10021 is not set +CONFIG_DVB_TDA10023=m +# CONFIG_DVB_VES1820 is not set + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_MXL692=m +CONFIG_DVB_NXT200X=m +# CONFIG_DVB_OR51132 is not set +# CONFIG_DVB_OR51211 is not set +CONFIG_DVB_S5H1409=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m +CONFIG_DVB_S921=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +# CONFIG_DVB_MN88443X is not set +CONFIG_DVB_TC90522=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_A8293=m +CONFIG_DVB_AF9033=m +# CONFIG_DVB_ASCOT2E is not set +CONFIG_DVB_ATBM8830=m +# CONFIG_DVB_HELENE is not set +# CONFIG_DVB_HORUS3A is not set +# CONFIG_DVB_ISL6405 is not set +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_IX2505V=m +# CONFIG_DVB_LGS8GL5 is not set +CONFIG_DVB_LGS8GXX=m +# CONFIG_DVB_LNBH25 is not set +# CONFIG_DVB_LNBH29 is not set +CONFIG_DVB_LNBP21=m 
+CONFIG_DVB_LNBP22=m +CONFIG_DVB_M88RS2000=m +# CONFIG_DVB_TDA665x is not set +CONFIG_DVB_DRX39XYJ=m + +# +# Common Interface (EN50221) controller drivers +# +# CONFIG_DVB_CXD2099 is not set +CONFIG_DVB_SP2=m +# end of Customise DVB Frontends + +# +# Tools to develop new frontends +# +# CONFIG_DVB_DUMMY_FE is not set +# end of Media ancillary drivers + +# +# Graphics support +# +CONFIG_DRM=y +CONFIG_DRM_MIPI_DSI=y +# CONFIG_DRM_DEBUG_MM is not set +CONFIG_DRM_KMS_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +# CONFIG_DRM_DEBUG_MODESET_LOCK is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DISPLAY_HELPER=y +CONFIG_DRM_DISPLAY_HDMI_HELPER=y +# CONFIG_DRM_DP_AUX_CHARDEV is not set +# CONFIG_DRM_DP_CEC is not set +CONFIG_DRM_TTM=y +CONFIG_DRM_VRAM_HELPER=y +CONFIG_DRM_TTM_HELPER=y +CONFIG_DRM_GEM_DMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=y + +# +# I2C encoder or helper chips +# +# CONFIG_DRM_I2C_CH7006 is not set +# CONFIG_DRM_I2C_SIL164 is not set +# CONFIG_DRM_I2C_NXP_TDA998X is not set +# CONFIG_DRM_I2C_NXP_TDA9950 is not set +# end of I2C encoder or helper chips + +# +# ARM devices +# +# CONFIG_DRM_HDLCD is not set +# CONFIG_DRM_MALI_DISPLAY is not set +# CONFIG_DRM_KOMEDA is not set +# end of ARM devices + +# CONFIG_DRM_RADEON is not set +# CONFIG_DRM_AMDGPU is not set +# CONFIG_DRM_NOUVEAU is not set +# CONFIG_DRM_VGEM is not set +# CONFIG_DRM_VKMS is not set +# CONFIG_DRM_VMWGFX is not set +# CONFIG_DRM_UDL is not set +# CONFIG_DRM_AST is not set +# CONFIG_DRM_MGAG200 is not set +# CONFIG_DRM_RCAR_DW_HDMI is not set +# CONFIG_DRM_RCAR_USE_LVDS is not set +# CONFIG_DRM_RCAR_USE_MIPI_DSI is not set +# CONFIG_DRM_QXL is not set +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +# CONFIG_DRM_PANEL_ABT_Y030XX067A is not set +# CONFIG_DRM_PANEL_ARM_VERSATILE is not set +# CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596 is not set 
+# CONFIG_DRM_PANEL_BOE_BF060Y8M_AJ0 is not set +# CONFIG_DRM_PANEL_BOE_HIMAX8279D is not set +# CONFIG_DRM_PANEL_BOE_TV101WUM_NL6 is not set +# CONFIG_DRM_PANEL_DSI_CM is not set +# CONFIG_DRM_PANEL_LVDS is not set +CONFIG_DRM_PANEL_SIMPLE=y +# CONFIG_DRM_PANEL_EDP is not set +# CONFIG_DRM_PANEL_EBBG_FT8719 is not set +# CONFIG_DRM_PANEL_ELIDA_KD35T133 is not set +# CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02 is not set +# CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D is not set +# CONFIG_DRM_PANEL_ILITEK_IL9322 is not set +# CONFIG_DRM_PANEL_ILITEK_ILI9341 is not set +# CONFIG_DRM_PANEL_ILITEK_ILI9806E is not set +# CONFIG_DRM_PANEL_ILITEK_ILI9881C is not set +# CONFIG_DRM_PANEL_INNOLUX_EJ030NA is not set +# CONFIG_DRM_PANEL_INNOLUX_P079ZCA is not set +# CONFIG_DRM_PANEL_JDI_LT070ME05000 is not set +# CONFIG_DRM_PANEL_JDI_R63452 is not set +# CONFIG_DRM_PANEL_KHADAS_TS050 is not set +# CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04 is not set +# CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W is not set +# CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829 is not set +# CONFIG_DRM_PANEL_SAMSUNG_LD9040 is not set +# CONFIG_DRM_PANEL_LG_LB035Q02 is not set +# CONFIG_DRM_PANEL_LG_LG4573 is not set +# CONFIG_DRM_PANEL_NEC_NL8048HL11 is not set +# CONFIG_DRM_PANEL_NEWVISION_NV3052C is not set +# CONFIG_DRM_PANEL_NOVATEK_NT35510 is not set +# CONFIG_DRM_PANEL_NOVATEK_NT35560 is not set +# CONFIG_DRM_PANEL_NOVATEK_NT35950 is not set +# CONFIG_DRM_PANEL_NOVATEK_NT36672A is not set +# CONFIG_DRM_PANEL_NOVATEK_NT39016 is not set +# CONFIG_DRM_PANEL_MANTIX_MLAF057WE51 is not set +# CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO is not set +# CONFIG_DRM_PANEL_ORISETECH_OTM8009A is not set +# CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS is not set +# CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00 is not set +# CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN is not set +# CONFIG_DRM_PANEL_RAYDIUM_RM67191 is not set +# CONFIG_DRM_PANEL_RAYDIUM_RM68200 is not set +# CONFIG_DRM_PANEL_RONBO_RB070D30 is not set +# CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20 is 
not set +# CONFIG_DRM_PANEL_SAMSUNG_DB7430 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6D16D0 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6D27A1 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6E63M0 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01 is not set +# CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0 is not set +# CONFIG_DRM_PANEL_SAMSUNG_SOFEF00 is not set +# CONFIG_DRM_PANEL_SEIKO_43WVF1G is not set +# CONFIG_DRM_PANEL_SHARP_LQ101R1SX01 is not set +# CONFIG_DRM_PANEL_SHARP_LS037V7DW01 is not set +# CONFIG_DRM_PANEL_SHARP_LS043T1LE01 is not set +# CONFIG_DRM_PANEL_SHARP_LS060T1SX01 is not set +# CONFIG_DRM_PANEL_SITRONIX_ST7701 is not set +# CONFIG_DRM_PANEL_SITRONIX_ST7703 is not set +# CONFIG_DRM_PANEL_SITRONIX_ST7789V is not set +# CONFIG_DRM_PANEL_SONY_ACX565AKM is not set +# CONFIG_DRM_PANEL_SONY_TULIP_TRULY_NT35521 is not set +# CONFIG_DRM_PANEL_TDO_TL070WSH30 is not set +# CONFIG_DRM_PANEL_TPO_Y17P is not set +# CONFIG_DRM_PANEL_TPO_TD028TTEC1 is not set +# CONFIG_DRM_PANEL_TPO_TD043MTEA1 is not set +# CONFIG_DRM_PANEL_TPO_TPG110 is not set +# CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA is not set +# CONFIG_DRM_PANEL_VISIONOX_RM69299 is not set +# CONFIG_DRM_PANEL_WAVESHARE_TOUCHSCREEN is not set +# CONFIG_DRM_PANEL_WIDECHIPS_WS2401 is not set +# CONFIG_DRM_PANEL_XINPENG_XPP055C272 is not set +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +# CONFIG_DRM_CDNS_DSI is not set +# CONFIG_DRM_CHIPONE_ICN6211 is not set +# CONFIG_DRM_CHRONTEL_CH7033 is not set +# CONFIG_DRM_DISPLAY_CONNECTOR is not set +# CONFIG_DRM_ITE_IT6505 is not set +# CONFIG_DRM_LONTIUM_LT8912B is not set +# CONFIG_DRM_LONTIUM_LT9211 is not set +# CONFIG_DRM_LONTIUM_LT9611 is not set +# CONFIG_DRM_LONTIUM_LT9611UXC is not set +# CONFIG_DRM_ITE_IT66121 is not set +# CONFIG_DRM_LVDS_CODEC is not set +# CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW is not set 
+# CONFIG_DRM_NWL_MIPI_DSI is not set +# CONFIG_DRM_NXP_PTN3460 is not set +# CONFIG_DRM_PARADE_PS8622 is not set +# CONFIG_DRM_PARADE_PS8640 is not set +# CONFIG_DRM_SIL_SII8620 is not set +# CONFIG_DRM_SII902X is not set +# CONFIG_DRM_SII9234 is not set +CONFIG_DRM_SIMPLE_BRIDGE=y +# CONFIG_DRM_THINE_THC63LVD1024 is not set +CONFIG_DRM_TOSHIBA_TC358762=y +# CONFIG_DRM_TOSHIBA_TC358764 is not set +# CONFIG_DRM_TOSHIBA_TC358767 is not set +# CONFIG_DRM_TOSHIBA_TC358768 is not set +# CONFIG_DRM_TOSHIBA_TC358775 is not set +# CONFIG_DRM_TI_DLPC3433 is not set +# CONFIG_DRM_TI_TFP410 is not set +# CONFIG_DRM_TI_SN65DSI83 is not set +# CONFIG_DRM_TI_SN65DSI86 is not set +# CONFIG_DRM_TI_TPD12S015 is not set +# CONFIG_DRM_ANALOGIX_ANX6345 is not set +# CONFIG_DRM_ANALOGIX_ANX78XX is not set +# CONFIG_DRM_ANALOGIX_ANX7625 is not set +# CONFIG_DRM_I2C_ADV7511 is not set +# CONFIG_DRM_CDNS_MHDP8546 is not set +# end of Display Interface Bridges + +CONFIG_DRM_V3D=y +CONFIG_DRM_VC4=y +CONFIG_DRM_VC4_HDMI_CEC=y +CONFIG_DRM_RP1_DSI=y +CONFIG_DRM_RP1_DPI=y +CONFIG_DRM_RP1_VEC=y +# CONFIG_DRM_ETNAVIV is not set +# CONFIG_DRM_HISI_HIBMC is not set +# CONFIG_DRM_HISI_KIRIN is not set +# CONFIG_DRM_LOGICVC is not set +# CONFIG_DRM_ARCPGU is not set +# CONFIG_DRM_BOCHS is not set +# CONFIG_DRM_CIRRUS_QEMU is not set +# CONFIG_DRM_GM12U320 is not set +# CONFIG_DRM_PANEL_MIPI_DBI is not set +# CONFIG_DRM_SIMPLEDRM is not set +# CONFIG_TINYDRM_HX8357D is not set +# CONFIG_TINYDRM_ILI9163 is not set +# CONFIG_TINYDRM_ILI9225 is not set +# CONFIG_TINYDRM_ILI9341 is not set +# CONFIG_TINYDRM_ILI9486 is not set +# CONFIG_TINYDRM_MI0283QT is not set +# CONFIG_TINYDRM_REPAPER is not set +# CONFIG_TINYDRM_ST7586 is not set +# CONFIG_TINYDRM_ST7735R is not set +# CONFIG_DRM_PL111 is not set +# CONFIG_DRM_LIMA is not set +# CONFIG_DRM_PANFROST is not set +# CONFIG_DRM_TIDSS is not set +# CONFIG_DRM_GUD is not set +# CONFIG_DRM_SSD130X is not set +# CONFIG_DRM_LEGACY is not set 
+CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y +CONFIG_DRM_NOMODESET=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +# CONFIG_FIRMWARE_EDID is not set +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=y +CONFIG_FB_SYS_COPYAREA=y +CONFIG_FB_SYS_IMAGEBLIT=y +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=y +CONFIG_FB_DEFERRED_IO=y +# CONFIG_FB_MODE_HELPERS is not set +# CONFIG_FB_TILEBLITTING is not set + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_BCM2708 is not set +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_ARMCLCD is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_EFI is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +# CONFIG_FB_SIMPLE is not set +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# CONFIG_FB_RPISENSE is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +# CONFIG_LCD_L4F00242T03 is not set +# CONFIG_LCD_LMS283GF05 is not set +# CONFIG_LCD_LTV350QV is not set +# CONFIG_LCD_ILI922X is not set +# 
CONFIG_LCD_ILI9320 is not set +# CONFIG_LCD_TDO24M is not set +# CONFIG_LCD_VGG2432A4 is not set +# CONFIG_LCD_PLATFORM is not set +# CONFIG_LCD_AMS369FG06 is not set +# CONFIG_LCD_LMS501KF03 is not set +# CONFIG_LCD_HX8357 is not set +# CONFIG_LCD_OTM3225A is not set +CONFIG_BACKLIGHT_CLASS_DEVICE=y +# CONFIG_BACKLIGHT_KTD253 is not set +# CONFIG_BACKLIGHT_PWM is not set +CONFIG_BACKLIGHT_RPI=y +# CONFIG_BACKLIGHT_QCOM_WLED is not set +# CONFIG_BACKLIGHT_ADP8860 is not set +# CONFIG_BACKLIGHT_ADP8870 is not set +# CONFIG_BACKLIGHT_LM3630A is not set +# CONFIG_BACKLIGHT_LM3639 is not set +# CONFIG_BACKLIGHT_LP855X is not set +CONFIG_BACKLIGHT_GPIO=y +# CONFIG_BACKLIGHT_LV5207LP is not set +# CONFIG_BACKLIGHT_BD6107 is not set +# CONFIG_BACKLIGHT_ARCXCNN is not set +# CONFIG_BACKLIGHT_LED is not set +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION is not set +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set +# CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER is not set +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_SND_TIMER=y +CONFIG_SND_PCM=y +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=y +CONFIG_SND_HWDEP=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=y +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +# CONFIG_SND_OSSEMUL is not set +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +CONFIG_SND_CTL_FAST_LOOKUP=y +# CONFIG_SND_DEBUG is not set +# CONFIG_SND_CTL_INPUT_VALIDATION is not 
set +CONFIG_SND_VMASTER=y +# CONFIG_SND_SEQUENCER is not set +# CONFIG_SND_DRIVERS is not set +CONFIG_SND_PCI=y +# CONFIG_SND_AD1889 is not set +# CONFIG_SND_ALS300 is not set +# CONFIG_SND_ALI5451 is not set +# CONFIG_SND_ATIIXP is not set +# CONFIG_SND_ATIIXP_MODEM is not set +# CONFIG_SND_AU8810 is not set +# CONFIG_SND_AU8820 is not set +# CONFIG_SND_AU8830 is not set +# CONFIG_SND_AW2 is not set +# CONFIG_SND_AZT3328 is not set +# CONFIG_SND_BT87X is not set +# CONFIG_SND_CA0106 is not set +# CONFIG_SND_CMIPCI is not set +# CONFIG_SND_OXYGEN is not set +# CONFIG_SND_CS4281 is not set +# CONFIG_SND_CS46XX is not set +# CONFIG_SND_CTXFI is not set +# CONFIG_SND_DARLA20 is not set +# CONFIG_SND_GINA20 is not set +# CONFIG_SND_LAYLA20 is not set +# CONFIG_SND_DARLA24 is not set +# CONFIG_SND_GINA24 is not set +# CONFIG_SND_LAYLA24 is not set +# CONFIG_SND_MONA is not set +# CONFIG_SND_MIA is not set +# CONFIG_SND_ECHO3G is not set +# CONFIG_SND_INDIGO is not set +# CONFIG_SND_INDIGOIO is not set +# CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set +# CONFIG_SND_EMU10K1 is not set +# CONFIG_SND_EMU10K1X is not set +# CONFIG_SND_ENS1370 is not set +# CONFIG_SND_ENS1371 is not set +# CONFIG_SND_ES1938 is not set +# CONFIG_SND_ES1968 is not set +# CONFIG_SND_FM801 is not set +# CONFIG_SND_HDSP is not set +# CONFIG_SND_HDSPM is not set +# CONFIG_SND_ICE1712 is not set +# CONFIG_SND_ICE1724 is not set +# CONFIG_SND_INTEL8X0 is not set +# CONFIG_SND_INTEL8X0M is not set +# CONFIG_SND_KORG1212 is not set +# CONFIG_SND_LOLA is not set +# CONFIG_SND_LX6464ES is not set +# CONFIG_SND_MAESTRO3 is not set +# CONFIG_SND_MIXART is not set +# CONFIG_SND_NM256 is not set +# CONFIG_SND_PCXHR is not set +# CONFIG_SND_RIPTIDE is not set +# CONFIG_SND_RME32 is not set +# CONFIG_SND_RME96 is not set +# CONFIG_SND_RME9652 is not set +# CONFIG_SND_SE6X is not set +# CONFIG_SND_SONICVIBES is not set +# CONFIG_SND_TRIDENT is not set +# 
CONFIG_SND_VIA82XX is not set +# CONFIG_SND_VIA82XX_MODEM is not set +# CONFIG_SND_VIRTUOSO is not set +# CONFIG_SND_VX222 is not set +# CONFIG_SND_YMFPCI is not set + +# +# HD-Audio +# +# CONFIG_SND_HDA_INTEL is not set +# end of HD-Audio + +CONFIG_SND_HDA_PREALLOC_SIZE=2048 +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_SOC=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +# CONFIG_SND_SOC_ADI is not set +# CONFIG_SND_SOC_AMD_ACP is not set +# CONFIG_SND_AMD_ACP_CONFIG is not set +# CONFIG_SND_ATMEL_SOC is not set +CONFIG_SND_BCM2835_SOC_I2S=m +# CONFIG_SND_BCM63XX_I2S_WHISTLER is not set +CONFIG_SND_BCM2708_SOC_CHIPDIP_DAC=m +CONFIG_SND_BCM2708_SOC_GOOGLEVOICEHAT_SOUNDCARD=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DAC=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUS=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSHD=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSADC=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSADCPRO=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSDSP=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_DIGI=m +CONFIG_SND_BCM2708_SOC_HIFIBERRY_AMP=m +CONFIG_SND_BCM2708_SOC_PIFI_40=m +CONFIG_SND_BCM2708_SOC_RPI_CIRRUS=m +CONFIG_SND_BCM2708_SOC_RPI_DAC=m +CONFIG_SND_BCM2708_SOC_RPI_PROTO=m +CONFIG_SND_BCM2708_SOC_JUSTBOOM_BOTH=m +CONFIG_SND_BCM2708_SOC_JUSTBOOM_DAC=m +CONFIG_SND_BCM2708_SOC_JUSTBOOM_DIGI=m +CONFIG_SND_BCM2708_SOC_IQAUDIO_CODEC=m +CONFIG_SND_BCM2708_SOC_IQAUDIO_DAC=m +CONFIG_SND_BCM2708_SOC_IQAUDIO_DIGI=m +CONFIG_SND_BCM2708_SOC_I_SABRE_Q2M=m +CONFIG_SND_BCM2708_SOC_ADAU1977_ADC=m +CONFIG_SND_AUDIOINJECTOR_PI_SOUNDCARD=m +CONFIG_SND_AUDIOINJECTOR_OCTO_SOUNDCARD=m +CONFIG_SND_AUDIOINJECTOR_ISOLATED_SOUNDCARD=m 
+CONFIG_SND_AUDIOSENSE_PI=m +CONFIG_SND_DIGIDAC1_SOUNDCARD=m +CONFIG_SND_BCM2708_SOC_DIONAUDIO_LOCO=m +CONFIG_SND_BCM2708_SOC_DIONAUDIO_LOCO_V2=m +CONFIG_SND_BCM2708_SOC_ALLO_PIANO_DAC=m +CONFIG_SND_BCM2708_SOC_ALLO_PIANO_DAC_PLUS=m +CONFIG_SND_BCM2708_SOC_ALLO_BOSS_DAC=m +CONFIG_SND_BCM2708_SOC_ALLO_BOSS2_DAC=m +CONFIG_SND_BCM2708_SOC_ALLO_DIGIONE=m +CONFIG_SND_BCM2708_SOC_ALLO_KATANA_DAC=m +CONFIG_SND_BCM2708_SOC_FE_PI_AUDIO=m +CONFIG_SND_PISOUND=m +CONFIG_SND_RPI_SIMPLE_SOUNDCARD=m +CONFIG_SND_RPI_WM8804_SOUNDCARD=m +CONFIG_SND_DACBERRY400=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_FSL_XCVR is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +# CONFIG_SND_I2S_HI6210_I2S is not set +# CONFIG_SND_SOC_IMG is not set +# CONFIG_SND_SOC_MTK_BTCVSD is not set +# CONFIG_SND_SOC_SOF_TOPLEVEL is not set + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +# CONFIG_SND_SOC_XILINX_I2S is not set +# CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER is not set +# CONFIG_SND_SOC_XILINX_SPDIF is not set +# CONFIG_SND_SOC_XTFPGA_I2S is not set +CONFIG_SND_SOC_I2C_AND_SPI=y + +# +# CODEC drivers +# +CONFIG_SND_SOC_ARIZONA=m +CONFIG_SND_SOC_WM_ADSP=m +# CONFIG_SND_SOC_AC97_CODEC is not set +# CONFIG_SND_SOC_AD193X_SPI is not set +# CONFIG_SND_SOC_AD193X_I2C is not set +# CONFIG_SND_SOC_ADAU1372_I2C is not set +# CONFIG_SND_SOC_ADAU1372_SPI is not set +CONFIG_SND_SOC_ADAU1701=m +# CONFIG_SND_SOC_ADAU1761_I2C is not set +# CONFIG_SND_SOC_ADAU1761_SPI is not set +CONFIG_SND_SOC_ADAU1977=m 
+CONFIG_SND_SOC_ADAU1977_I2C=m +CONFIG_SND_SOC_ADAU7002=m +# CONFIG_SND_SOC_ADAU7118_HW is not set +# CONFIG_SND_SOC_ADAU7118_I2C is not set +# CONFIG_SND_SOC_AK4104 is not set +# CONFIG_SND_SOC_AK4118 is not set +# CONFIG_SND_SOC_AK4375 is not set +# CONFIG_SND_SOC_AK4458 is not set +CONFIG_SND_SOC_AK4554=m +# CONFIG_SND_SOC_AK4613 is not set +# CONFIG_SND_SOC_AK4642 is not set +# CONFIG_SND_SOC_AK5386 is not set +# CONFIG_SND_SOC_AK5558 is not set +# CONFIG_SND_SOC_ALC5623 is not set +# CONFIG_SND_SOC_AW8738 is not set +# CONFIG_SND_SOC_BD28623 is not set +# CONFIG_SND_SOC_BT_SCO is not set +# CONFIG_SND_SOC_CS35L32 is not set +# CONFIG_SND_SOC_CS35L33 is not set +# CONFIG_SND_SOC_CS35L34 is not set +# CONFIG_SND_SOC_CS35L35 is not set +# CONFIG_SND_SOC_CS35L36 is not set +# CONFIG_SND_SOC_CS35L41_SPI is not set +# CONFIG_SND_SOC_CS35L41_I2C is not set +# CONFIG_SND_SOC_CS35L45_SPI is not set +# CONFIG_SND_SOC_CS35L45_I2C is not set +# CONFIG_SND_SOC_CS42L42 is not set +# CONFIG_SND_SOC_CS42L51_I2C is not set +# CONFIG_SND_SOC_CS42L52 is not set +# CONFIG_SND_SOC_CS42L56 is not set +# CONFIG_SND_SOC_CS42L73 is not set +# CONFIG_SND_SOC_CS42L83 is not set +# CONFIG_SND_SOC_CS4234 is not set +CONFIG_SND_SOC_CS4265=m +# CONFIG_SND_SOC_CS4270 is not set +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +# CONFIG_SND_SOC_CS4271_SPI is not set +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +# CONFIG_SND_SOC_CS43130 is not set +# CONFIG_SND_SOC_CS4341 is not set +# CONFIG_SND_SOC_CS4349 is not set +# CONFIG_SND_SOC_CS53L30 is not set +# CONFIG_SND_SOC_CX2072X is not set +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=y +# CONFIG_SND_SOC_ES7134 is not set +# CONFIG_SND_SOC_ES7241 is not set +# CONFIG_SND_SOC_ES8316 is not set +# CONFIG_SND_SOC_ES8326 is not set +# CONFIG_SND_SOC_ES8328_I2C is not set +# CONFIG_SND_SOC_ES8328_SPI is not set +# CONFIG_SND_SOC_GTM601 is not set +# CONFIG_SND_SOC_HDA is not set +# 
CONFIG_SND_SOC_ICS43432 is not set +# CONFIG_SND_SOC_INNO_RK3036 is not set +CONFIG_SND_SOC_MA120X0P=m +# CONFIG_SND_SOC_MAX98088 is not set +# CONFIG_SND_SOC_MAX98357A is not set +# CONFIG_SND_SOC_MAX98504 is not set +# CONFIG_SND_SOC_MAX9867 is not set +# CONFIG_SND_SOC_MAX98927 is not set +# CONFIG_SND_SOC_MAX98520 is not set +# CONFIG_SND_SOC_MAX98373_I2C is not set +# CONFIG_SND_SOC_MAX98390 is not set +# CONFIG_SND_SOC_MAX98396 is not set +# CONFIG_SND_SOC_MAX9860 is not set +# CONFIG_SND_SOC_MSM8916_WCD_DIGITAL is not set +# CONFIG_SND_SOC_PCM1681 is not set +# CONFIG_SND_SOC_PCM1789_I2C is not set +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +# CONFIG_SND_SOC_PCM179X_SPI is not set +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +# CONFIG_SND_SOC_PCM186X_SPI is not set +# CONFIG_SND_SOC_PCM3060_I2C is not set +# CONFIG_SND_SOC_PCM3060_SPI is not set +# CONFIG_SND_SOC_PCM3168A_I2C is not set +# CONFIG_SND_SOC_PCM3168A_SPI is not set +CONFIG_SND_SOC_PCM5102A=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +# CONFIG_SND_SOC_PCM512x_SPI is not set +# CONFIG_SND_SOC_RK3328 is not set +# CONFIG_SND_SOC_RT5616 is not set +CONFIG_SND_SOC_PCM1794A=m +# CONFIG_SND_SOC_RT5631 is not set +# CONFIG_SND_SOC_RT5640 is not set +# CONFIG_SND_SOC_RT5659 is not set +# CONFIG_SND_SOC_RT9120 is not set +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +# CONFIG_SND_SOC_SIMPLE_MUX is not set +CONFIG_SND_SOC_SPDIF=m +# CONFIG_SND_SOC_SRC4XXX_I2C is not set +# CONFIG_SND_SOC_SSM2305 is not set +# CONFIG_SND_SOC_SSM2518 is not set +# CONFIG_SND_SOC_SSM2602_SPI is not set +# CONFIG_SND_SOC_SSM2602_I2C is not set +# CONFIG_SND_SOC_SSM4567 is not set +CONFIG_SND_SOC_STA32X=m +# CONFIG_SND_SOC_STA350 is not set +# CONFIG_SND_SOC_STI_SAS is not set +# CONFIG_SND_SOC_TAS2552 is not set +# CONFIG_SND_SOC_TAS2562 is not set +# CONFIG_SND_SOC_TAS2764 is not set +# CONFIG_SND_SOC_TAS2770 is not 
set +# CONFIG_SND_SOC_TAS2780 is not set +# CONFIG_SND_SOC_TAS5086 is not set +CONFIG_SND_SOC_TAS571X=m +# CONFIG_SND_SOC_TAS5720 is not set +# CONFIG_SND_SOC_TAS5805M is not set +# CONFIG_SND_SOC_TAS6424 is not set +# CONFIG_SND_SOC_TDA7419 is not set +# CONFIG_SND_SOC_TFA9879 is not set +CONFIG_SND_SOC_TAS5713=m +# CONFIG_SND_SOC_TFA989X is not set +# CONFIG_SND_SOC_TLV320ADC3XXX is not set +# CONFIG_SND_SOC_TLV320AIC23_I2C is not set +# CONFIG_SND_SOC_TLV320AIC23_SPI is not set +# CONFIG_SND_SOC_TLV320AIC31XX is not set +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +# CONFIG_SND_SOC_TLV320AIC32X4_SPI is not set +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320AIC3X_I2C=m +# CONFIG_SND_SOC_TLV320AIC3X_SPI is not set +# CONFIG_SND_SOC_TLV320ADCX140 is not set +# CONFIG_SND_SOC_TS3A227E is not set +# CONFIG_SND_SOC_TSCS42XX is not set +# CONFIG_SND_SOC_TSCS454 is not set +# CONFIG_SND_SOC_UDA1334 is not set +CONFIG_SND_SOC_WM5102=m +# CONFIG_SND_SOC_WM8510 is not set +# CONFIG_SND_SOC_WM8523 is not set +# CONFIG_SND_SOC_WM8524 is not set +# CONFIG_SND_SOC_WM8580 is not set +# CONFIG_SND_SOC_WM8711 is not set +# CONFIG_SND_SOC_WM8728 is not set +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8731_I2C=m +# CONFIG_SND_SOC_WM8731_SPI is not set +# CONFIG_SND_SOC_WM8737 is not set +CONFIG_SND_SOC_WM8741=m +# CONFIG_SND_SOC_WM8750 is not set +# CONFIG_SND_SOC_WM8753 is not set +# CONFIG_SND_SOC_WM8770 is not set +# CONFIG_SND_SOC_WM8776 is not set +# CONFIG_SND_SOC_WM8782 is not set +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +# CONFIG_SND_SOC_WM8804_SPI is not set +# CONFIG_SND_SOC_WM8903 is not set +# CONFIG_SND_SOC_WM8904 is not set +# CONFIG_SND_SOC_WM8940 is not set +CONFIG_SND_SOC_WM8960=m +# CONFIG_SND_SOC_WM8962 is not set +# CONFIG_SND_SOC_WM8974 is not set +# CONFIG_SND_SOC_WM8978 is not set +# CONFIG_SND_SOC_WM8985 is not set +# CONFIG_SND_SOC_ZL38060 is not set +# CONFIG_SND_SOC_MAX9759 is not set +# CONFIG_SND_SOC_MT6351 is not 
set +# CONFIG_SND_SOC_MT6358 is not set +# CONFIG_SND_SOC_MT6660 is not set +# CONFIG_SND_SOC_NAU8315 is not set +# CONFIG_SND_SOC_NAU8540 is not set +# CONFIG_SND_SOC_NAU8810 is not set +# CONFIG_SND_SOC_NAU8821 is not set +# CONFIG_SND_SOC_NAU8822 is not set +# CONFIG_SND_SOC_NAU8824 is not set +CONFIG_SND_SOC_TPA6130A2=m +# CONFIG_SND_SOC_LPASS_WSA_MACRO is not set +# CONFIG_SND_SOC_LPASS_VA_MACRO is not set +# CONFIG_SND_SOC_LPASS_RX_MACRO is not set +# CONFIG_SND_SOC_LPASS_TX_MACRO is not set +CONFIG_SND_SOC_I_SABRE_CODEC=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +# CONFIG_SND_AUDIO_GRAPH_CARD2 is not set +# CONFIG_SND_TEST_COMPONENT is not set + +# +# HID support +# +CONFIG_HID=y +# CONFIG_HID_BATTERY_STRENGTH is not set +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_GENERIC=y + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=y +# CONFIG_HID_ACCUTOUCH is not set +# CONFIG_HID_ACRUX is not set +CONFIG_HID_APPLE=y +# CONFIG_HID_APPLEIR is not set +CONFIG_HID_ASUS=y +CONFIG_HID_AUREAL=y +CONFIG_HID_BELKIN=y +# CONFIG_HID_BETOP_FF is not set +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +# CONFIG_HID_CORSAIR is not set +# CONFIG_HID_COUGAR is not set +# CONFIG_HID_MACALLY is not set +# CONFIG_HID_PRODIKEYS is not set +# CONFIG_HID_CMEDIA is not set +# CONFIG_HID_CP2112 is not set +# CONFIG_HID_CREATIVE_SB0540 is not set +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +# CONFIG_HID_EMS_FF is not set +# CONFIG_HID_ELAN is not set +# CONFIG_HID_ELECOM is not set +# CONFIG_HID_ELO is not set +CONFIG_HID_EZKEY=y +# CONFIG_HID_FT260 is not set +# CONFIG_HID_GEMBIRD is not set +# CONFIG_HID_GFRM is not set +# CONFIG_HID_GLORIOUS is not set +# CONFIG_HID_HOLTEK is not set +# CONFIG_HID_VIVALDI is not set +# CONFIG_HID_GT683R is not set +# CONFIG_HID_KEYTOUCH is not set +CONFIG_HID_KYE=y +# CONFIG_HID_UCLOGIC is not set +# CONFIG_HID_WALTOP is not set +# 
CONFIG_HID_VIEWSONIC is not set +# CONFIG_HID_VRC2 is not set +# CONFIG_HID_XIAOMI is not set +CONFIG_HID_GYRATION=y +# CONFIG_HID_ICADE is not set +# CONFIG_HID_ITE is not set +# CONFIG_HID_JABRA is not set +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +# CONFIG_HID_LED is not set +CONFIG_HID_LENOVO=y +# CONFIG_HID_LETSKETCH is not set +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_LOGITECH_HIDPP=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +# CONFIG_HID_MAGICMOUSE is not set +# CONFIG_HID_MALTRON is not set +# CONFIG_HID_MAYFLASH is not set +# CONFIG_HID_MEGAWORLD_FF is not set +# CONFIG_HID_REDRAGON is not set +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NINTENDO=m +CONFIG_NINTENDO_FF=y +# CONFIG_HID_NTI is not set +# CONFIG_HID_NTRIG is not set +CONFIG_HID_ORTEK=y +CONFIG_HID_OUYA=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=y +CONFIG_HID_PETALYNX=y +# CONFIG_HID_PICOLCD is not set +# CONFIG_HID_PLANTRONICS is not set +# CONFIG_HID_PXRC is not set +# CONFIG_HID_RAZER is not set +# CONFIG_HID_PRIMAX is not set +# CONFIG_HID_RETRODE is not set +# CONFIG_HID_ROCCAT is not set +# CONFIG_HID_SAITEK is not set +CONFIG_HID_SAMSUNG=y +# CONFIG_HID_SEMITEK is not set +# CONFIG_HID_SIGMAMICRO is not set +CONFIG_HID_SONY=y +CONFIG_SONY_FF=y +# CONFIG_HID_SPEEDLINK is not set +CONFIG_HID_STEAM=m +# CONFIG_HID_STEELSERIES is not set +CONFIG_HID_SUNPLUS=y +CONFIG_HID_RMI=y +# CONFIG_HID_GREENASIA is not set +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +# CONFIG_HID_TOPRE is not set +# CONFIG_HID_THINGM is not set +# CONFIG_HID_THRUSTMASTER is not set +# CONFIG_HID_UDRAW_PS3 is not set +# CONFIG_HID_U2FZERO is not set +# CONFIG_HID_WACOM is not set +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=y +# CONFIG_HID_ZEROPLUS is not set +CONFIG_HID_ZYDACRON=y +# CONFIG_HID_SENSOR_HUB is 
not set +# CONFIG_HID_ALPS is not set +# CONFIG_HID_MCP2221 is not set +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=y +# CONFIG_HID_PID is not set +CONFIG_USB_HIDDEV=y +# end of USB HID support + +# +# I2C HID support +# +# CONFIG_I2C_HID_OF is not set +# CONFIG_I2C_HID_OF_ELAN is not set +# CONFIG_I2C_HID_OF_GOODIX is not set +# end of I2C HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +# CONFIG_USB_LED_TRIG is not set +# CONFIG_USB_ULPI_BUS is not set +# CONFIG_USB_CONN_GPIO is not set +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +# CONFIG_USB_FEW_INIT_RETRIES is not set +# CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_PRODUCTLIST is not set +# CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set +# CONFIG_USB_LEDS_TRIGGER_USBPORT is not set +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +# CONFIG_USB_C67X00_HCD is not set +CONFIG_USB_XHCI_HCD=y +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=y +# CONFIG_USB_XHCI_PCI_RENESAS is not set +CONFIG_USB_XHCI_PLATFORM=y +# CONFIG_USB_BRCMSTB is not set +# CONFIG_USB_EHCI_HCD is not set +# CONFIG_USB_OXU210HP_HCD is not set +# CONFIG_USB_ISP116X_HCD is not set +# CONFIG_USB_FOTG210_HCD is not set +# CONFIG_USB_MAX3421_HCD is not set +# CONFIG_USB_OHCI_HCD is not set +# CONFIG_USB_UHCI_HCD is not set +# CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set +CONFIG_USB_DWCOTG=y +# CONFIG_USB_HCD_BCMA is not set +# CONFIG_USB_HCD_SSB is not set +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +# CONFIG_USB_PRINTER is not set +# CONFIG_USB_WDM is not set +# CONFIG_USB_TMC is not set + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see 
USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=y +# CONFIG_USB_STORAGE_DEBUG is not set +# CONFIG_USB_STORAGE_REALTEK is not set +# CONFIG_USB_STORAGE_DATAFAB is not set +# CONFIG_USB_STORAGE_FREECOM is not set +# CONFIG_USB_STORAGE_ISD200 is not set +# CONFIG_USB_STORAGE_USBAT is not set +# CONFIG_USB_STORAGE_SDDR09 is not set +# CONFIG_USB_STORAGE_SDDR55 is not set +# CONFIG_USB_STORAGE_JUMPSHOT is not set +# CONFIG_USB_STORAGE_ALAUDA is not set +# CONFIG_USB_STORAGE_ONETOUCH is not set +# CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set +# CONFIG_USB_STORAGE_ENE_UB6250 is not set +CONFIG_USB_UAS=y + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set +# CONFIG_USBIP_CORE is not set +# CONFIG_USB_CDNS_SUPPORT is not set +# CONFIG_USB_MUSB_HDRC is not set +CONFIG_USB_DWC3=y +CONFIG_USB_DWC3_HOST=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_HAPS=y +CONFIG_USB_DWC3_OF_SIMPLE=y +CONFIG_USB_DWC2=y +CONFIG_USB_DWC2_HOST=y + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PCI is not set +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +# CONFIG_USB_ISP1760 is not set + +# +# USB port drivers +# +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +# CONFIG_USB_SERIAL_SIMPLE is not set +# CONFIG_USB_SERIAL_AIRCABLE is not set +# CONFIG_USB_SERIAL_ARK3116 is not set +# CONFIG_USB_SERIAL_BELKIN is not set +CONFIG_USB_SERIAL_CH341=m +# CONFIG_USB_SERIAL_WHITEHEAT is not set +# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set +CONFIG_USB_SERIAL_CP210X=m +# CONFIG_USB_SERIAL_CYPRESS_M8 is not set +# CONFIG_USB_SERIAL_EMPEG is not set +CONFIG_USB_SERIAL_FTDI_SIO=m +# CONFIG_USB_SERIAL_VISOR is not set +# CONFIG_USB_SERIAL_IPAQ is not set +# CONFIG_USB_SERIAL_IR is not set +# CONFIG_USB_SERIAL_EDGEPORT is not set +# CONFIG_USB_SERIAL_EDGEPORT_TI is not set +# CONFIG_USB_SERIAL_F81232 is not set +# 
CONFIG_USB_SERIAL_F8153X is not set +# CONFIG_USB_SERIAL_GARMIN is not set +# CONFIG_USB_SERIAL_IPW is not set +CONFIG_USB_SERIAL_IUU=m +# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set +# CONFIG_USB_SERIAL_KEYSPAN is not set +# CONFIG_USB_SERIAL_KLSI is not set +# CONFIG_USB_SERIAL_KOBIL_SCT is not set +# CONFIG_USB_SERIAL_MCT_U232 is not set +# CONFIG_USB_SERIAL_METRO is not set +# CONFIG_USB_SERIAL_MOS7720 is not set +# CONFIG_USB_SERIAL_MOS7840 is not set +# CONFIG_USB_SERIAL_MXUPORT is not set +# CONFIG_USB_SERIAL_NAVMAN is not set +CONFIG_USB_SERIAL_PL2303=m +# CONFIG_USB_SERIAL_OTI6858 is not set +# CONFIG_USB_SERIAL_QCAUX is not set +# CONFIG_USB_SERIAL_QUALCOMM is not set +# CONFIG_USB_SERIAL_SPCP8X5 is not set +# CONFIG_USB_SERIAL_SAFE is not set +# CONFIG_USB_SERIAL_SIERRAWIRELESS is not set +# CONFIG_USB_SERIAL_SYMBOL is not set +# CONFIG_USB_SERIAL_TI is not set +# CONFIG_USB_SERIAL_CYBERJACK is not set +# CONFIG_USB_SERIAL_OPTION is not set +# CONFIG_USB_SERIAL_OMNINET is not set +# CONFIG_USB_SERIAL_OPTICON is not set +# CONFIG_USB_SERIAL_XSENS_MT is not set +# CONFIG_USB_SERIAL_WISHBONE is not set +# CONFIG_USB_SERIAL_SSU100 is not set +# CONFIG_USB_SERIAL_QT2 is not set +# CONFIG_USB_SERIAL_UPD78F0730 is not set +# CONFIG_USB_SERIAL_XR is not set +# CONFIG_USB_SERIAL_DEBUG is not set + +# +# USB Miscellaneous drivers +# +# CONFIG_USB_EMI62 is not set +# CONFIG_USB_EMI26 is not set +# CONFIG_USB_ADUTUX is not set +# CONFIG_USB_SEVSEG is not set +# CONFIG_USB_LEGOTOWER is not set +# CONFIG_USB_LCD is not set +# CONFIG_USB_CYPRESS_CY7C63 is not set +# CONFIG_USB_CYTHERM is not set +# CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_FTDI_ELAN is not set +# CONFIG_USB_APPLEDISPLAY is not set +# CONFIG_APPLE_MFI_FASTCHARGE is not set +# CONFIG_USB_LD is not set +# CONFIG_USB_TRANCEVIBRATOR is not set +# CONFIG_USB_IOWARRIOR is not set +# CONFIG_USB_TEST is not set +# CONFIG_USB_EHSET_TEST_FIXTURE is not set +# CONFIG_USB_ISIGHTFW is not set +# CONFIG_USB_YUREX is 
not set +# CONFIG_USB_EZUSB_FX2 is not set +# CONFIG_USB_HUB_USB251XB is not set +# CONFIG_USB_HSIC_USB3503 is not set +# CONFIG_USB_HSIC_USB4604 is not set +# CONFIG_USB_LINK_LAYER_TEST is not set +# CONFIG_USB_CHAOSKEY is not set +CONFIG_BRCM_USB_PINMAP=y +# CONFIG_USB_ONBOARD_HUB is not set + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=y +# CONFIG_USB_GPIO_VBUS is not set +# CONFIG_USB_ISP1301 is not set +# CONFIG_USB_ULPI is not set +# end of USB Physical Layer drivers + +# CONFIG_USB_GADGET is not set +# CONFIG_TYPEC is not set +CONFIG_USB_ROLE_SWITCH=y +CONFIG_MMC=y +CONFIG_PWRSEQ_EMMC=y +CONFIG_PWRSEQ_SIMPLE=y +CONFIG_MMC_BLOCK=y +CONFIG_MMC_BLOCK_MINORS=32 +# CONFIG_SDIO_UART is not set +# CONFIG_MMC_TEST is not set + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_BCM2835_MMC is not set +CONFIG_MMC_BCM2835_SDHOST=y +# CONFIG_MMC_DEBUG is not set +# CONFIG_MMC_ARMMMCI is not set +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +# CONFIG_MMC_SDHCI_PCI is not set +CONFIG_MMC_SDHCI_PLTFM=y +# CONFIG_MMC_SDHCI_OF_ARASAN is not set +# CONFIG_MMC_SDHCI_OF_AT91 is not set +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +# CONFIG_MMC_SDHCI_CADENCE is not set +# CONFIG_MMC_SDHCI_F_SDH30 is not set +# CONFIG_MMC_SDHCI_MILBEAUT is not set +CONFIG_MMC_SDHCI_IPROC=y +# CONFIG_MMC_TIFM_SD is not set +# CONFIG_MMC_SPI is not set +# CONFIG_MMC_CB710 is not set +# CONFIG_MMC_VIA_SDMMC is not set +# CONFIG_MMC_DW is not set +# CONFIG_MMC_VUB300 is not set +# CONFIG_MMC_USHC is not set +# CONFIG_MMC_USDHI6ROL0 is not set +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=y +# CONFIG_MMC_HSQ is not set +# CONFIG_MMC_TOSHIBA_PCI is not set +# CONFIG_MMC_BCM2835 is not set +# CONFIG_MMC_MTK is not set +CONFIG_MMC_SDHCI_BRCMSTB=y +# CONFIG_MMC_SDHCI_XENON is not set +# CONFIG_MMC_SDHCI_OMAP is not set +# CONFIG_MMC_SDHCI_AM654 is not set +# CONFIG_SCSI_UFSHCD is not set +# CONFIG_MEMSTICK is not set +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y 
+CONFIG_LEDS_CLASS_FLASH=y +# CONFIG_LEDS_CLASS_MULTICOLOR is not set +# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set + +# +# LED drivers +# +# CONFIG_LEDS_AN30259A is not set +# CONFIG_LEDS_AW2013 is not set +# CONFIG_LEDS_BCM6328 is not set +# CONFIG_LEDS_BCM6358 is not set +# CONFIG_LEDS_CR0014114 is not set +# CONFIG_LEDS_EL15203000 is not set +# CONFIG_LEDS_LM3530 is not set +# CONFIG_LEDS_LM3532 is not set +# CONFIG_LEDS_LM3642 is not set +# CONFIG_LEDS_LM3692X is not set +# CONFIG_LEDS_PCA9532 is not set +CONFIG_LEDS_GPIO=y +# CONFIG_LEDS_LP3944 is not set +# CONFIG_LEDS_LP3952 is not set +# CONFIG_LEDS_LP50XX is not set +# CONFIG_LEDS_LP55XX_COMMON is not set +# CONFIG_LEDS_LP8860 is not set +# CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_PCA963X is not set +# CONFIG_LEDS_DAC124S085 is not set +CONFIG_LEDS_PWM=y +# CONFIG_LEDS_REGULATOR is not set +# CONFIG_LEDS_BD2802 is not set +# CONFIG_LEDS_LT3593 is not set +# CONFIG_LEDS_TCA6507 is not set +# CONFIG_LEDS_TLC591XX is not set +# CONFIG_LEDS_LM355x is not set +# CONFIG_LEDS_IS31FL319X is not set +# CONFIG_LEDS_IS31FL32XX is not set + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +# CONFIG_LEDS_BLINKM is not set +# CONFIG_LEDS_SYSCON is not set +# CONFIG_LEDS_MLXREG is not set +# CONFIG_LEDS_USER is not set +# CONFIG_LEDS_SPI_BYTE is not set +# CONFIG_LEDS_TI_LMU_COMMON is not set + +# +# Flash and Torch LED drivers +# +# CONFIG_LEDS_AAT1290 is not set +# CONFIG_LEDS_AS3645A is not set +# CONFIG_LEDS_KTD2692 is not set +# CONFIG_LEDS_LM3601X is not set +# CONFIG_LEDS_RT4505 is not set +# CONFIG_LEDS_RT8515 is not set +# CONFIG_LEDS_SGM3140 is not set + +# +# RGB LED drivers +# + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=y +CONFIG_LEDS_TRIGGER_ONESHOT=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=y +CONFIG_LEDS_TRIGGER_BACKLIGHT=y +CONFIG_LEDS_TRIGGER_CPU=y +# CONFIG_LEDS_TRIGGER_ACTIVITY is not set +CONFIG_LEDS_TRIGGER_GPIO=y 
+CONFIG_LEDS_TRIGGER_DEFAULT_ON=y + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=y +CONFIG_LEDS_TRIGGER_CAMERA=y +CONFIG_LEDS_TRIGGER_INPUT=y +# CONFIG_LEDS_TRIGGER_PANIC is not set +# CONFIG_LEDS_TRIGGER_NETDEV is not set +# CONFIG_LEDS_TRIGGER_PATTERN is not set +# CONFIG_LEDS_TRIGGER_AUDIO is not set +# CONFIG_LEDS_TRIGGER_TTY is not set +CONFIG_LEDS_TRIGGER_ACTPWR=y + +# +# Simple LED drivers +# +# CONFIG_ACCESSIBILITY is not set +# CONFIG_INFINIBAND is not set +CONFIG_EDAC_SUPPORT=y +# CONFIG_EDAC is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +# CONFIG_RTC_DRV_ABB5ZES3 is not set +# CONFIG_RTC_DRV_ABEOZ9 is not set +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_RPI=y +CONFIG_RTC_DRV_BRCMSTB=y +CONFIG_RTC_DRV_DS1307=m +# CONFIG_RTC_DRV_DS1307_CENTURY is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_HYM8563 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_NCT3018Y is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_ISL12022 is not set +# CONFIG_RTC_DRV_ISL12026 is not set +# CONFIG_RTC_DRV_X1205 is not set +CONFIG_RTC_DRV_PCF8523=m +# CONFIG_RTC_DRV_PCF85063 is not set +# CONFIG_RTC_DRV_PCF85363 is not set +CONFIG_RTC_DRV_PCF8563=m +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_BQ32K is not set +# CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8010 is not set +# CONFIG_RTC_DRV_RX8581 is not set +# CONFIG_RTC_DRV_RX8025 is not set +# CONFIG_RTC_DRV_EM3027 
is not set +# CONFIG_RTC_DRV_RV3028 is not set +# CONFIG_RTC_DRV_RV3032 is not set +# CONFIG_RTC_DRV_RV8803 is not set +# CONFIG_RTC_DRV_SD3078 is not set + +# +# SPI RTC drivers +# +# CONFIG_RTC_DRV_M41T93 is not set +# CONFIG_RTC_DRV_M41T94 is not set +# CONFIG_RTC_DRV_DS1302 is not set +# CONFIG_RTC_DRV_DS1305 is not set +# CONFIG_RTC_DRV_DS1343 is not set +# CONFIG_RTC_DRV_DS1347 is not set +# CONFIG_RTC_DRV_DS1390 is not set +# CONFIG_RTC_DRV_MAX6916 is not set +# CONFIG_RTC_DRV_R9701 is not set +# CONFIG_RTC_DRV_RX4581 is not set +# CONFIG_RTC_DRV_RS5C348 is not set +# CONFIG_RTC_DRV_MAX6902 is not set +# CONFIG_RTC_DRV_PCF2123 is not set +# CONFIG_RTC_DRV_MCP795 is not set +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +# CONFIG_RTC_DRV_RV3029C2 is not set +# CONFIG_RTC_DRV_RX6110 is not set + +# +# Platform RTC drivers +# +# CONFIG_RTC_DRV_DS1286 is not set +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1685_FAMILY is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_DS2404 is not set +# CONFIG_RTC_DRV_EFI is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T35 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_MSM6242 is not set +# CONFIG_RTC_DRV_BQ4802 is not set +# CONFIG_RTC_DRV_RP5C01 is not set +# CONFIG_RTC_DRV_V3020 is not set +# CONFIG_RTC_DRV_ZYNQMP is not set + +# +# on-CPU RTC drivers +# +# CONFIG_RTC_DRV_PL030 is not set +# CONFIG_RTC_DRV_PL031 is not set +# CONFIG_RTC_DRV_CADENCE is not set +# CONFIG_RTC_DRV_FTRTC010 is not set +# CONFIG_RTC_DRV_R7301 is not set + +# +# HID Sensor RTC drivers +# +# CONFIG_RTC_DRV_GOLDFISH is not set +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_OF=y +# CONFIG_ALTERA_MSGDMA is not set +# 
CONFIG_AMBA_PL08X is not set +# CONFIG_BCM_SBA_RAID is not set +CONFIG_DMA_BCM2835=y +CONFIG_DW_AXI_DMAC=y +# CONFIG_FSL_EDMA is not set +# CONFIG_FSL_QDMA is not set +# CONFIG_INTEL_IDMA64 is not set +# CONFIG_MV_XOR_V2 is not set +# CONFIG_PL330_DMA is not set +# CONFIG_PLX_DMA is not set +# CONFIG_DMA_BCM2708 is not set +# CONFIG_XILINX_DMA is not set +# CONFIG_XILINX_ZYNQMP_DMA is not set +# CONFIG_XILINX_ZYNQMP_DPDMA is not set +# CONFIG_QCOM_HIDMA_MGMT is not set +# CONFIG_QCOM_HIDMA is not set +# CONFIG_DW_DMAC is not set +# CONFIG_DW_DMAC_PCI is not set +# CONFIG_DW_EDMA is not set +# CONFIG_DW_EDMA_PCIE is not set +# CONFIG_SF_PDMA is not set + +# +# DMA Clients +# +# CONFIG_ASYNC_TX_DMA is not set +# CONFIG_DMATEST is not set + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +# CONFIG_UDMABUF is not set +# CONFIG_DMABUF_MOVE_NOTIFY is not set +# CONFIG_DMABUF_DEBUG is not set +# CONFIG_DMABUF_SELFTESTS is not set +CONFIG_DMABUF_HEAPS=y +# CONFIG_DMABUF_SYSFS_STATS is not set +CONFIG_DMABUF_HEAPS_SYSTEM=y +CONFIG_DMABUF_HEAPS_CMA=y +# end of DMABUF options + +# CONFIG_AUXDISPLAY is not set +# CONFIG_UIO is not set +# CONFIG_VFIO is not set +# CONFIG_VIRT_DRIVERS is not set +# CONFIG_VIRTIO_MENU is not set +# CONFIG_VDPA is not set +CONFIG_VHOST_MENU=y +# CONFIG_VHOST_NET is not set +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# Microsoft Hyper-V guest support +# +# end of Microsoft Hyper-V guest support + +# CONFIG_GREYBUS is not set +# CONFIG_COMEDI is not set +CONFIG_STAGING=y +# CONFIG_PRISM2_USB is not set +# CONFIG_RTL8192U is not set +# CONFIG_RTLLIB is not set +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +# CONFIG_RTS5208 is not set +# CONFIG_VT6655 is not set +CONFIG_VT6656=m +# CONFIG_FB_SM750 is not set +CONFIG_STAGING_MEDIA=y +# CONFIG_VIDEO_MAX96712 is not set +CONFIG_VIDEO_RPIVID=m +# CONFIG_STAGING_MEDIA_DEPRECATED is not set +# CONFIG_STAGING_BOARD is not set +# CONFIG_LTE_GDM724X is not set +# 
CONFIG_FB_TFT is not set +# CONFIG_KS7010 is not set +# CONFIG_BCM_VIDEOCORE is not set +# CONFIG_PI433 is not set +# CONFIG_XIL_AXIS_FIFO is not set +# CONFIG_FIELDBUS_DEV is not set +# CONFIG_QLGE is not set +# CONFIG_VME_BUS is not set +# CONFIG_GOLDFISH is not set +# CONFIG_CHROME_PLATFORMS is not set +# CONFIG_MELLANOX_PLATFORM is not set +CONFIG_SURFACE_PLATFORMS=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Clock driver for ARM Reference designs +# +# CONFIG_CLK_ICST is not set +# CONFIG_CLK_SP810 is not set +# end of Clock driver for ARM Reference designs + +# CONFIG_LMK04832 is not set +# CONFIG_COMMON_CLK_MAX9485 is not set +CONFIG_COMMON_CLK_RP1=y +CONFIG_COMMON_CLK_RP1_SDIO=y +CONFIG_COMMON_CLK_HIFIBERRY_DACPLUSHD=m +CONFIG_COMMON_CLK_HIFIBERRY_DACPRO=m +# CONFIG_COMMON_CLK_SI5341 is not set +# CONFIG_COMMON_CLK_SI5351 is not set +# CONFIG_COMMON_CLK_SI514 is not set +# CONFIG_COMMON_CLK_SI544 is not set +# CONFIG_COMMON_CLK_SI570 is not set +# CONFIG_COMMON_CLK_CDCE706 is not set +# CONFIG_COMMON_CLK_CDCE925 is not set +# CONFIG_COMMON_CLK_CS2000_CP is not set +# CONFIG_COMMON_CLK_AXI_CLKGEN is not set +# CONFIG_COMMON_CLK_XGENE is not set +# CONFIG_COMMON_CLK_PWM is not set +# CONFIG_COMMON_CLK_RS9_PCIE is not set +# CONFIG_COMMON_CLK_VC5 is not set +# CONFIG_COMMON_CLK_VC7 is not set +# CONFIG_COMMON_CLK_FIXED_MMIO is not set +CONFIG_CLK_BCM2711_DVP=y +CONFIG_CLK_BCM2835=y +CONFIG_CLK_RASPBERRYPI=y +# CONFIG_XILINX_VCU is not set +# CONFIG_COMMON_CLK_XLNX_CLKWZRD is not set +# CONFIG_HWSPINLOCK is not set + +# +# Clock Source drivers +# +CONFIG_TIMER_OF=y +CONFIG_TIMER_PROBE=y +CONFIG_CLKSRC_MMIO=y +CONFIG_ARM_ARCH_TIMER=y +CONFIG_ARM_ARCH_TIMER_EVTSTREAM=y +CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND=y +CONFIG_FSL_ERRATUM_A008585=y +CONFIG_HISILICON_ERRATUM_161010101=y +CONFIG_ARM64_ERRATUM_858921=y +CONFIG_ARM_TIMER_SP804=y +# CONFIG_MICROCHIP_PIT64B is not set +# end of Clock Source drivers + +CONFIG_MAILBOX=y +# 
CONFIG_ARM_MHU is not set +# CONFIG_ARM_MHU_V2 is not set +# CONFIG_PLATFORM_MHU is not set +# CONFIG_PL320_MBOX is not set +# CONFIG_ALTERA_MBOX is not set +CONFIG_BCM2835_MBOX=y +# CONFIG_MAILBOX_TEST is not set +CONFIG_IOMMU_IOVA=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# CONFIG_IOMMU_IO_PGTABLE_LPAE is not set +# CONFIG_IOMMU_IO_PGTABLE_ARMV7S is not set +# CONFIG_IOMMU_IO_PGTABLE_DART is not set +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +CONFIG_IOMMU_DEFAULT_DMA_STRICT=y +# CONFIG_IOMMU_DEFAULT_DMA_LAZY is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +# CONFIG_ARM_SMMU is not set +# CONFIG_ARM_SMMU_V3 is not set +CONFIG_BCM2712_IOMMU=y + +# +# Remoteproc drivers +# +# CONFIG_REMOTEPROC is not set +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +# CONFIG_RPMSG_QCOM_GLINK_RPM is not set +# CONFIG_RPMSG_VIRTIO is not set +# end of Rpmsg drivers + +# CONFIG_SOUNDWIRE is not set + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Broadcom SoC drivers +# +CONFIG_BCM2835_POWER=y +CONFIG_RASPBERRYPI_POWER=y +CONFIG_SOC_BRCMSTB=y +CONFIG_BRCMSTB_PM=y +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# CONFIG_QUICC_ENGINE is not set +# end of NXP/Freescale QorIQ SoC drivers + +# +# fujitsu SoC drivers +# +# end of fujitsu SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Enable LiteX SoC Builder specific drivers +# +# CONFIG_LITEX_SOC_CONTROLLER is not set +# end of Enable LiteX SoC Builder specific drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +# CONFIG_SOC_TI is not set + +# +# Xilinx SoC drivers +# +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +# CONFIG_PM_DEVFREQ is not set +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +# CONFIG_EXTCON_FSA9480 is not 
set +# CONFIG_EXTCON_GPIO is not set +# CONFIG_EXTCON_MAX3355 is not set +# CONFIG_EXTCON_PTN5150 is not set +# CONFIG_EXTCON_RT8973A is not set +# CONFIG_EXTCON_SM5502 is not set +# CONFIG_EXTCON_USB_GPIO is not set +# CONFIG_MEMORY is not set +# CONFIG_IIO is not set +# CONFIG_NTB is not set +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +# CONFIG_PWM_DEBUG is not set +# CONFIG_PWM_ATMEL_TCB is not set +CONFIG_PWM_BCM2835=m +CONFIG_PWM_BRCMSTB=y +# CONFIG_PWM_CLK is not set +# CONFIG_PWM_DWC is not set +# CONFIG_PWM_FSL_FTM is not set +# CONFIG_PWM_PCA9685 is not set +CONFIG_PWM_RASPBERRYPI_POE=m +CONFIG_PWM_RP1=y +# CONFIG_PWM_XILINX is not set + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_ARM_GIC=y +CONFIG_ARM_GIC_MAX_NR=1 +CONFIG_ARM_GIC_V2M=y +CONFIG_ARM_GIC_V3=y +CONFIG_ARM_GIC_V3_ITS=y +CONFIG_ARM_GIC_V3_ITS_PCI=y +# CONFIG_AL_FIC is not set +CONFIG_BCM2712_MIP=y +CONFIG_BCM7038_L1_IRQ=y +CONFIG_BCM7120_L2_IRQ=y +CONFIG_BRCMSTB_L2_IRQ=y +# CONFIG_XILINX_INTC is not set +CONFIG_PARTITION_PERCPU=y +# end of IRQ chip support + +# CONFIG_IPACK_BUS is not set +CONFIG_ARCH_HAS_RESET_CONTROLLER=y +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_BRCMSTB=y +CONFIG_RESET_BRCMSTB_RESCAL=y +CONFIG_RESET_RASPBERRYPI=y +CONFIG_RESET_SIMPLE=y +# CONFIG_RESET_TI_SYSCON is not set +# CONFIG_RESET_TI_TPS380X is not set + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +# CONFIG_PHY_XGENE is not set +# CONFIG_PHY_CAN_TRANSCEIVER is not set + +# +# PHY drivers for Broadcom platforms +# +# CONFIG_BCM_KONA_USB2_PHY is not set +# CONFIG_PHY_BRCM_SATA is not set +CONFIG_PHY_BRCM_USB=y +# end of PHY drivers for Broadcom platforms + +# CONFIG_PHY_CADENCE_TORRENT is not set +# CONFIG_PHY_CADENCE_DPHY is not set +# CONFIG_PHY_CADENCE_DPHY_RX is not set +# CONFIG_PHY_CADENCE_SIERRA is not set +# CONFIG_PHY_CADENCE_SALVO is not set +# CONFIG_PHY_PXA_28NM_HSIC is not set +# CONFIG_PHY_PXA_28NM_USB2 is not set +# CONFIG_PHY_LAN966X_SERDES is not set +# 
CONFIG_PHY_MAPPHONE_MDM6600 is not set +# CONFIG_PHY_OCELOT_SERDES is not set +# CONFIG_PHY_SAMSUNG_USB2 is not set +# end of PHY Subsystem + +# CONFIG_POWERCAP is not set +# CONFIG_MCB is not set + +# +# Performance monitor support +# +# CONFIG_ARM_CCI_PMU is not set +# CONFIG_ARM_CCN is not set +# CONFIG_ARM_CMN is not set +CONFIG_ARM_PMU=y +# CONFIG_ARM_DSU_PMU is not set +# CONFIG_ARM_SPE_PMU is not set +CONFIG_RPI_AXIPERF=m +# CONFIG_HISI_PCIE_PMU is not set +# CONFIG_HNS3_PMU is not set +# end of Performance monitor support + +CONFIG_RAS=y +# CONFIG_USB4 is not set + +# +# Android +# +# CONFIG_ANDROID_BINDER_IPC is not set +# end of Android + +# CONFIG_LIBNVDIMM is not set +# CONFIG_DAX is not set +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_NVMEM_RMEM=m + +# +# HW tracing support +# +# CONFIG_STM is not set +# CONFIG_INTEL_TH is not set +# CONFIG_HISI_PTT is not set +# end of HW tracing support + +# CONFIG_FPGA is not set +# CONFIG_FSI is not set +# CONFIG_TEE is not set +CONFIG_PM_OPP=y +# CONFIG_SIOX is not set +# CONFIG_SLIMBUS is not set +# CONFIG_INTERCONNECT is not set +# CONFIG_COUNTER is not set +# CONFIG_MOST is not set +# CONFIG_PECI is not set +# CONFIG_HTE is not set +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=y +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=y +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_REISERFS_FS_XATTR is not set +CONFIG_JFS_FS=m +# CONFIG_JFS_POSIX_ACL is not set +# CONFIG_JFS_SECURITY is not set +# CONFIG_JFS_DEBUG is not set +# CONFIG_JFS_STATISTICS is not set +CONFIG_XFS_FS=m +CONFIG_XFS_SUPPORT_V4=y +# CONFIG_XFS_QUOTA is not set +# CONFIG_XFS_POSIX_ACL is not set +# 
CONFIG_XFS_RT is not set +# CONFIG_XFS_ONLINE_SCRUB is not set +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +# CONFIG_GFS2_FS is not set +# CONFIG_OCFS2_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +# CONFIG_NILFS2_FS is not set +# CONFIG_F2FS_FS is not set +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +# CONFIG_EXPORTFS_BLOCK_OPS is not set +CONFIG_FILE_LOCKING=y +# CONFIG_FS_ENCRYPTION is not set +# CONFIG_FS_VERITY is not set +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +# CONFIG_QUOTA is not set +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +# CONFIG_CUSE is not set +# CONFIG_VIRTIO_FS is not set +CONFIG_OVERLAY_FS=m +# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set +CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y +# CONFIG_OVERLAY_FS_INDEX is not set +# CONFIG_OVERLAY_FS_XINO_AUTO is not set +# CONFIG_OVERLAY_FS_METACOPY is not set + +# +# Caches +# +CONFIG_NETFS_SUPPORT=y +# CONFIG_NETFS_STATS is not set +CONFIG_FSCACHE=y +# CONFIG_FSCACHE_STATS is not set +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_CACHEFILES is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=y +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/EXFAT/NT Filesystems +# +CONFIG_FAT_FS=y +# CONFIG_MSDOS_FS is not set +CONFIG_VFAT_FS=y +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +# CONFIG_FAT_DEFAULT_UTF8 is not set +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +# CONFIG_NTFS_FS is not set +CONFIG_NTFS3_FS=m +# CONFIG_NTFS3_64BIT_CLUSTER is not set +# CONFIG_NTFS3_LZX_XPRESS is not set +# CONFIG_NTFS3_FS_POSIX_ACL is not set +# end of DOS/FAT/EXFAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +# 
CONFIG_PROC_KCORE is not set +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_PROC_CHILDREN is not set +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +# CONFIG_TMPFS_INODE64 is not set +CONFIG_ARCH_SUPPORTS_HUGETLBFS=y +# CONFIG_HUGETLBFS is not set +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=m +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +# CONFIG_ORANGEFS_FS is not set +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_ECRYPT_FS is not set +CONFIG_HFS_FS=y +CONFIG_HFSPLUS_FS=y +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_CRAMFS is not set +CONFIG_SQUASHFS=y +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +# CONFIG_SQUASHFS_DECOMP_MULTI is not set +CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y +# CONFIG_SQUASHFS_XATTR is not set +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +# CONFIG_ROMFS_FS is not set +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFAULT_KMSG_BYTES=10240 +CONFIG_PSTORE_DEFLATE_COMPRESS=y +# CONFIG_PSTORE_LZO_COMPRESS is not set +# CONFIG_PSTORE_LZ4_COMPRESS is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS is not set +# CONFIG_PSTORE_842_COMPRESS is not set +# CONFIG_PSTORE_ZSTD_COMPRESS is not set +CONFIG_PSTORE_COMPRESS=y +CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="deflate" +CONFIG_PSTORE_CONSOLE=y +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_PSTORE_BLK is not set +# 
CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set +# CONFIG_EROFS_FS is not set +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V2=y +CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set +CONFIG_NFS_V4=y +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=y +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=y +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_ROOT_NFS=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DISABLE_UDP_SUPPORT=y +# CONFIG_NFS_V4_2_READ_PLUS is not set +# CONFIG_NFSD is not set +CONFIG_GRACE_PERIOD=y +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_COMMON=y +CONFIG_NFS_V4_2_SSC_HELPER=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +# CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES is not set +# CONFIG_SUNRPC_DEBUG is not set +# CONFIG_CEPH_FS is not set +CONFIG_CIFS=y +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_UPCALL is not set +# CONFIG_CIFS_XATTR is not set +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +# CONFIG_CIFS_DFS_UPCALL is not set +# CONFIG_CIFS_SWN_UPCALL is not set +CONFIG_CIFS_FSCACHE=y +# CONFIG_CIFS_ROOT is not set +# CONFIG_SMB_SERVER is not set +CONFIG_SMBFS=y +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# 
CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_MAC_ROMAN is not set +# CONFIG_NLS_MAC_CELTIC is not set +# CONFIG_NLS_MAC_CENTEURO is not set +# CONFIG_NLS_MAC_CROATIAN is not set +# CONFIG_NLS_MAC_CYRILLIC is not set +# CONFIG_NLS_MAC_GAELIC is not set +# CONFIG_NLS_MAC_GREEK is not set +# CONFIG_NLS_MAC_ICELAND is not set +# CONFIG_NLS_MAC_INUIT is not set +# CONFIG_NLS_MAC_ROMANIAN is not set +# CONFIG_NLS_MAC_TURKISH is not set +CONFIG_NLS_UTF8=y +# CONFIG_DLM is not set +# CONFIG_UNICODE is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +# CONFIG_KEYS_REQUEST_CACHE is not set +# CONFIG_PERSISTENT_KEYRINGS is not set +# CONFIG_TRUSTED_KEYS is not set +# CONFIG_ENCRYPTED_KEYS is not set +CONFIG_KEY_DH_OPERATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +# CONFIG_SECURITY is not set +# CONFIG_SECURITYFS is not set +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +# CONFIG_HARDENED_USERCOPY is not set +# CONFIG_FORTIFY_SOURCE is not set +# CONFIG_STATIC_USERMODEHELPER is not set +# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="yama,loadpin,safesetid,integrity" + +# +# Kernel hardening 
options +# + +# +# Memory initialization +# +CONFIG_CC_HAS_AUTO_VAR_INIT_PATTERN=y +CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO_BARE=y +CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO=y +CONFIG_INIT_STACK_NONE=y +# CONFIG_INIT_STACK_ALL_PATTERN is not set +# CONFIG_INIT_STACK_ALL_ZERO is not set +# CONFIG_GCC_PLUGIN_STACKLEAK is not set +# CONFIG_INIT_ON_ALLOC_DEFAULT_ON is not set +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +CONFIG_CC_HAS_ZERO_CALL_USED_REGS=y +# CONFIG_ZERO_CALL_USED_REGS is not set +# end of Memory initialization + +CONFIG_RANDSTRUCT_NONE=y +# CONFIG_RANDSTRUCT_FULL is not set +# CONFIG_RANDSTRUCT_PERFORMANCE is not set +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +# CONFIG_CRYPTO_USER is not set +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +# CONFIG_CRYPTO_PCRYPT is not set +CONFIG_CRYPTO_CRYPTD=y +CONFIG_CRYPTO_AUTHENC=y +# CONFIG_CRYPTO_TEST is not set +# end of Crypto core or helper + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +# CONFIG_CRYPTO_DH_RFC7919_GROUPS is not set +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +# CONFIG_CRYPTO_ECDSA is not set +# CONFIG_CRYPTO_ECRDSA is not set +# CONFIG_CRYPTO_SM2 is not set +# CONFIG_CRYPTO_CURVE25519 is not set +# end of Public-key cryptography + +# +# Block ciphers +# +CONFIG_CRYPTO_AES=y +# CONFIG_CRYPTO_AES_TI is not set +# CONFIG_CRYPTO_ANUBIS is not set +# CONFIG_CRYPTO_ARIA is not set +# CONFIG_CRYPTO_BLOWFISH 
is not set +# CONFIG_CRYPTO_CAMELLIA is not set +# CONFIG_CRYPTO_CAST5 is not set +# CONFIG_CRYPTO_CAST6 is not set +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_FCRYPT is not set +# CONFIG_CRYPTO_KHAZAD is not set +# CONFIG_CRYPTO_SEED is not set +# CONFIG_CRYPTO_SERPENT is not set +CONFIG_CRYPTO_SM4=m +# CONFIG_CRYPTO_SM4_GENERIC is not set +# CONFIG_CRYPTO_TEA is not set +# CONFIG_CRYPTO_TWOFISH is not set +# end of Block ciphers + +# +# Length-preserving ciphers and modes +# +# CONFIG_CRYPTO_ADIANTUM is not set +# CONFIG_CRYPTO_ARC4 is not set +# CONFIG_CRYPTO_CHACHA20 is not set +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=y +# CONFIG_CRYPTO_HCTR2 is not set +# CONFIG_CRYPTO_KEYWRAP is not set +# CONFIG_CRYPTO_LRW is not set +CONFIG_CRYPTO_OFB=m +# CONFIG_CRYPTO_PCBC is not set +# CONFIG_CRYPTO_XTS is not set +# end of Length-preserving ciphers and modes + +# +# AEAD (authenticated encryption with associated data) ciphers +# +# CONFIG_CRYPTO_AEGIS128 is not set +# CONFIG_CRYPTO_CHACHA20POLY1305 is not set +CONFIG_CRYPTO_CCM=y +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=y +# CONFIG_CRYPTO_ESSIV is not set +# end of AEAD (authenticated encryption with associated data) ciphers + +# +# Hashes, digests, and MACs +# +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_MD4=y +CONFIG_CRYPTO_MD5=y +# CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_POLY1305 is not set +# CONFIG_CRYPTO_RMD160 is not set +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +# CONFIG_CRYPTO_SM3_GENERIC is not set +# CONFIG_CRYPTO_STREEBOG is not set +# CONFIG_CRYPTO_VMAC is not set +# CONFIG_CRYPTO_WP512 is not set +# CONFIG_CRYPTO_XCBC is not set +CONFIG_CRYPTO_XXHASH=m +# end of Hashes, digests, and MACs + +# +# CRCs (cyclic redundancy checks) +# +CONFIG_CRYPTO_CRC32C=y +CONFIG_CRYPTO_CRC32=y +# 
CONFIG_CRYPTO_CRCT10DIF is not set +# end of CRCs (cyclic redundancy checks) + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=y +CONFIG_CRYPTO_LZO=m +# CONFIG_CRYPTO_842 is not set +# CONFIG_CRYPTO_LZ4 is not set +# CONFIG_CRYPTO_LZ4HC is not set +# CONFIG_CRYPTO_ZSTD is not set +# end of Compression + +# +# Random number generation +# +# CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +# CONFIG_CRYPTO_DRBG_HASH is not set +# CONFIG_CRYPTO_DRBG_CTR is not set +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_KDF800108_CTR=y +# end of Random number generation + +# +# Userspace interface +# +CONFIG_CRYPTO_USER_API=y +CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +# CONFIG_CRYPTO_USER_API_RNG is not set +# CONFIG_CRYPTO_USER_API_AEAD is not set +CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE=y +# end of Userspace interface + +CONFIG_CRYPTO_HASH_INFO=y +# CONFIG_CRYPTO_NHPOLY1305_NEON is not set +CONFIG_CRYPTO_CHACHA20_NEON=m + +# +# Accelerated Cryptographic Algorithms for CPU (arm64) +# +CONFIG_CRYPTO_GHASH_ARM64_CE=m +CONFIG_CRYPTO_POLY1305_NEON=m +CONFIG_CRYPTO_SHA1_ARM64_CE=m +CONFIG_CRYPTO_SHA256_ARM64=m +CONFIG_CRYPTO_SHA2_ARM64_CE=m +CONFIG_CRYPTO_SHA512_ARM64=m +CONFIG_CRYPTO_SHA512_ARM64_CE=m +CONFIG_CRYPTO_SHA3_ARM64=m +# CONFIG_CRYPTO_SM3_NEON is not set +CONFIG_CRYPTO_SM3_ARM64_CE=m +# CONFIG_CRYPTO_POLYVAL_ARM64_CE is not set +CONFIG_CRYPTO_AES_ARM64=y +CONFIG_CRYPTO_AES_ARM64_CE=m +CONFIG_CRYPTO_AES_ARM64_CE_BLK=m +CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m +CONFIG_CRYPTO_AES_ARM64_BS=m +CONFIG_CRYPTO_SM4_ARM64_CE=m +# CONFIG_CRYPTO_SM4_ARM64_CE_BLK is not set +# CONFIG_CRYPTO_SM4_ARM64_NEON_BLK is not set +CONFIG_CRYPTO_AES_ARM64_CE_CCM=m +# end of Accelerated Cryptographic Algorithms for CPU (arm64) + +# CONFIG_CRYPTO_HW is not set +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m 
+CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +# CONFIG_SIGNED_PE_FILE_VERIFICATION is not set +# CONFIG_FIPS_SIGNATURE_SELFTEST is not set + +# +# Certificates for signature checking +# +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +# CONFIG_SECONDARY_TRUSTED_KEYRING is not set +# CONFIG_SYSTEM_BLACKLIST_KEYRING is not set +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_LINEAR_RANGES=y +# CONFIG_PACKING is not set +CONFIG_BITREVERSE=y +CONFIG_HAVE_ARCH_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_CORDIC=m +# CONFIG_PRIME_NUMBERS is not set +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_ARCH_USE_SYM_ANNOTATIONS=y +# CONFIG_INDIRECT_PIO is not set +# CONFIG_TRACE_MMIO_ACCESS is not set + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_UTILS=y +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=y +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=y +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=9 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA1=y +CONFIG_CRYPTO_LIB_SHA256=y +# end of Crypto library routines + +CONFIG_CRC_CCITT=m +CONFIG_CRC16=y +# CONFIG_CRC_T10DIF is not set +# CONFIG_CRC64_ROCKSOFT is not set +CONFIG_CRC_ITU_T=y +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +# CONFIG_CRC64 is not set +# CONFIG_CRC4 is not set +# CONFIG_CRC7 is not 
set +CONFIG_LIBCRC32C=m +# CONFIG_CRC8 is not set +CONFIG_XXHASH=y +CONFIG_AUDIT_ARCH_COMPAT_GENERIC=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=m +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMMON=y +CONFIG_ZSTD_COMPRESS=m +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +# CONFIG_XZ_DEC_X86 is not set +# CONFIG_XZ_DEC_POWERPC is not set +# CONFIG_XZ_DEC_IA64 is not set +# CONFIG_XZ_DEC_ARM is not set +# CONFIG_XZ_DEC_ARMTHUMB is not set +# CONFIG_XZ_DEC_SPARC is not set +# CONFIG_XZ_DEC_MICROLZMA is not set +# CONFIG_XZ_DEC_TEST is not set +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_DMA_OPS=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_DMA_DECLARE_COHERENT=y +CONFIG_ARCH_HAS_SETUP_DMA_OPS=y +CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS=y +CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE=y +CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU=y +CONFIG_ARCH_HAS_DMA_PREP_COHERENT=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_RESTRICTED_POOL is not set +CONFIG_DMA_NONCOHERENT_MMAP=y +CONFIG_DMA_COHERENT_POOL=y +CONFIG_DMA_DIRECT_REMAP=y +CONFIG_DMA_CMA=y +# CONFIG_DMA_PERNUMA_CMA is not set + +# +# Default contiguous memory area size: +# +CONFIG_CMA_SIZE_MBYTES=5 +CONFIG_CMA_SIZE_SEL_MBYTES=y +# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set +# CONFIG_CMA_SIZE_SEL_MIN is not set +# CONFIG_CMA_SIZE_SEL_MAX is not set +CONFIG_CMA_ALIGNMENT=8 +# CONFIG_DMA_API_DEBUG is not set +# CONFIG_DMA_MAP_BENCHMARK is not set +CONFIG_SGL_ALLOC=y +# CONFIG_FORCE_NR_CPUS is not set +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_CLZ_TAB=y +# CONFIG_IRQ_POLL is not set +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y 
+CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_FONT_SUPPORT=y +# CONFIG_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y +CONFIG_SG_POOL=y +CONFIG_ARCH_STACKWALK=y +CONFIG_STACKDEPOT=y +CONFIG_SBITMAP=y +# end of Library routines + +CONFIG_GENERIC_IOREMAP=y +CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED=y + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +# CONFIG_STACKTRACE_BUILD_ID is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7 +CONFIG_CONSOLE_LOGLEVEL_QUIET=4 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DYNAMIC_DEBUG_CORE=y +CONFIG_SYMBOLIC_ERRNAME=y +# CONFIG_DEBUG_BUGVERBOSE is not set +# end of printk and dmesg options + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Compile-time checks and compiler options +# +CONFIG_AS_HAS_NON_CONST_LEB128=y +CONFIG_DEBUG_INFO_NONE=y +# CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is not set +# CONFIG_DEBUG_INFO_DWARF4 is not set +# CONFIG_DEBUG_INFO_DWARF5 is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B is not set +CONFIG_ARCH_WANT_FRAME_POINTERS=y +CONFIG_FRAME_POINTER=y +# CONFIG_VMLINUX_MAP is not set +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +# CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set +# CONFIG_DEBUG_FS_ALLOW_NONE is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +CONFIG_HAVE_ARCH_KCSAN=y 
+CONFIG_HAVE_KCSAN_COMPILER=y +# CONFIG_KCSAN is not set +# end of Generic Kernel Debugging Instruments + +# +# Networking Debugging +# +# CONFIG_NET_DEV_REFCNT_TRACKER is not set +# CONFIG_NET_NS_REFCNT_TRACKER is not set +# CONFIG_DEBUG_NET is not set +# end of Networking Debugging + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_PAGE_OWNER is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_ARCH_HAS_DEBUG_WX=y +# CONFIG_DEBUG_WX is not set +CONFIG_GENERIC_PTDUMP=y +# CONFIG_PTDUMP_DEBUGFS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SHRINKER_DEBUG is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_SCHED_STACK_END_CHECK is not set +CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y +# CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_VM_PGTABLE is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +# CONFIG_DEBUG_MEMORY_INIT is not set +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_SW_TAGS=y +CONFIG_HAVE_ARCH_KASAN_HW_TAGS=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +CONFIG_CC_HAS_KASAN_SW_TAGS=y +CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y +# CONFIG_KASAN is not set +CONFIG_HAVE_ARCH_KFENCE=y +# CONFIG_KFENCE is not set +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +# CONFIG_SOFTLOCKUP_DETECTOR is not set +# CONFIG_DETECT_HUNG_TASK is not set +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_TEST_LOCKUP is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +# CONFIG_SCHED_DEBUG is not set +# CONFIG_SCHEDSTATS is not set +# end of Scheduler Debugging + +# 
CONFIG_DEBUG_TIMEKEEPING is not set + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# CONFIG_SCF_TORTURE_TEST is not set +# CONFIG_CSD_LOCK_WAIT_DEBUG is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) + +CONFIG_TRACE_IRQFLAGS=y +CONFIG_TRACE_IRQFLAGS_NMI=y +# CONFIG_DEBUG_IRQFLAGS is not set +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# CONFIG_DEBUG_MAPLE_TREE is not set +# end of Debug kernel data structures + +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_SCALE_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +# CONFIG_RCU_REF_SCALE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=21 +CONFIG_RCU_EXP_CPU_STALL_TIMEOUT=0 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_LATENCYTOP is not set +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y 
+CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_PREEMPTIRQ_TRACEPOINTS=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +# CONFIG_FUNCTION_PROFILER is not set +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_SCHED_TRACER=y +# CONFIG_HWLAT_TRACER is not set +# CONFIG_OSNOISE_TRACER is not set +# CONFIG_TIMERLAT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set +CONFIG_TRACER_SNAPSHOT=y +CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_PROFILE_ALL_BRANCHES is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +# CONFIG_UPROBE_EVENTS is not set +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +# CONFIG_BPF_KPROBE_OVERRIDE is not set +CONFIG_FTRACE_MCOUNT_RECORD=y +CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y +# CONFIG_SYNTH_EVENTS is not set +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_FTRACE_RECORD_RECURSION is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_KPROBE_EVENT_GEN_TEST is not set +# CONFIG_RV is not set +# CONFIG_SAMPLES is not set +# CONFIG_STRICT_DEVMEM is not set + +# +# arm64 Debugging +# +# CONFIG_PID_IN_CONTEXTIDR is not set +# CONFIG_ARM64_RELOC_TEST is not set +# CONFIG_CORESIGHT is not set +# end of arm64 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# 
CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +# CONFIG_RUNTIME_TESTING_MENU is not set +CONFIG_ARCH_USE_MEMTEST=y +# CONFIG_MEMTEST is not set +# end of Kernel Testing and Coverage + +# +# Rust hacking +# +# end of Rust hacking +# end of Kernel hacking diff --git a/projects/RPi/devices/RPi5/options b/projects/RPi/devices/RPi5/options new file mode 100644 index 0000000000..1377a7e468 --- /dev/null +++ b/projects/RPi/devices/RPi5/options @@ -0,0 +1,43 @@ +################################################################################ +# Device defaults +################################################################################ + + # NOOBS supported hex versions (legacy) is not relevant for RPi4 + unset NOOBS_HEX + + # NOOBS supported model versions + NOOBS_SUPPORTED_MODELS='"Pi 5"' + + # additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware) + FIRMWARE="${FIRMWARE} rpi-eeprom flashrom" + + # set the addon project + ADDON_PROJECT="ARMv8" + + # The TARGET_CPU variable controls which processor should be targeted for + # generated code. 
+ case $TARGET_ARCH in + aarch64) + TARGET_CPU="cortex-a76" + TARGET_CPU_FLAGS="+crc+crypto" + ;; + arm) + TARGET_KERNEL_ARCH="arm64" + TARGET_KERNEL_PATCH_ARCH="aarch64" + TARGET_FLOAT="hard" + # cortex-a72 caused issues in the past, so use a53 + TARGET_CPU="cortex-a53" + TARGET_CPU_FLAGS="+crc+crypto" + TARGET_FPU="neon-fp-armv8" + ;; + esac + + # Kernel target + KERNEL_TARGET="Image" + + # debug tty path + DEBUG_TTY="/dev/ttyAMA10" + + # serial console + EXTRA_CMDLINE="console=ttyAMA10,115200 console=tty0" + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch b/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch new file mode 100644 index 0000000000..ee9e032293 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch @@ -0,0 +1,332 @@ +From f62aa2640f92796ff5216da0a5d3c8f46a2855b4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Mon, 26 Apr 2021 00:02:21 +0200 +Subject: [PATCH 001/142] broadcom(cle,clif,common,simulator): add 7.1 version + on the list of versions to build + +This adds 7.1 to the list of available V3D_VERSION, and first changes +on the simulator needed to get it working. + +Note that we needed to touch all those 4 codebases because it is +needed if we want to use V3D_DEBUG=clif with the simulator, that it is +the easier way to see which packets a vulkan program is using. + +About the simulator, this commit only handle the rename of some +registers. Any additional changes needed to get a proper support for +v71 will be handled them on following commits. 
+--- + src/broadcom/cle/meson.build | 3 +- + src/broadcom/cle/v3dx_pack.h | 2 + + src/broadcom/clif/clif_private.h | 2 + + src/broadcom/common/v3d_device_info.c | 1 + + src/broadcom/common/v3d_macros.h | 3 + + src/broadcom/meson.build | 2 +- + src/broadcom/simulator/v3d_simulator.c | 81 +++++++++++++++++++------ + src/broadcom/simulator/v3d_simulator.h | 5 ++ + src/broadcom/simulator/v3dx_simulator.c | 31 ++++++++-- + 9 files changed, 106 insertions(+), 24 deletions(-) + +diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build +index 31a0d5bfa94..8ac32b313e4 100644 +--- a/src/broadcom/cle/meson.build ++++ b/src/broadcom/cle/meson.build +@@ -23,7 +23,8 @@ v3d_versions = [ + [21, 21], + [33, 33], + [41, 33], +- [42, 33] ++ [42, 33], ++ [71, 33] + ] + + v3d_xml_files = [] +diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h +index 5762e5aaa70..e5a1eb26698 100644 +--- a/src/broadcom/cle/v3dx_pack.h ++++ b/src/broadcom/cle/v3dx_pack.h +@@ -37,6 +37,8 @@ + # include "cle/v3d_packet_v41_pack.h" + #elif (V3D_VERSION == 42) + # include "cle/v3d_packet_v42_pack.h" ++#elif (V3D_VERSION == 71) ++# include "cle/v3d_packet_v71_pack.h" + #else + # error "Need to add a pack header include for this v3d version" + #endif +diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h +index 6ace62b0310..cda407a00bf 100644 +--- a/src/broadcom/clif/clif_private.h ++++ b/src/broadcom/clif/clif_private.h +@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); + bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); ++bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, ++ const uint8_t *cl, uint32_t *size, bool reloc_mode); + + static inline void + out(struct clif_dump *clif, const char *fmt, ...) 
+diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c +index 272190eb2e5..7e0862f1f02 100644 +--- a/src/broadcom/common/v3d_device_info.c ++++ b/src/broadcom/common/v3d_device_info.c +@@ -66,6 +66,7 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + case 33: + case 41: + case 42: ++ case 71: + break; + default: + fprintf(stderr, +diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h +index fe89398208a..b4291fb5350 100644 +--- a/src/broadcom/common/v3d_macros.h ++++ b/src/broadcom/common/v3d_macros.h +@@ -41,6 +41,9 @@ + #elif (V3D_VERSION == 42) + # define V3DX(x) V3D42_##x + # define v3dX(x) v3d42_##x ++#elif (V3D_VERSION == 71) ++# define V3DX(x) V3D71_##x ++# define v3dX(x) v3d71_##x + #else + # error "Need to add prefixing macros for this v3d version" + #endif +diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build +index 2c10e46b188..73cb7aa0575 100644 +--- a/src/broadcom/meson.build ++++ b/src/broadcom/meson.build +@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') + + subdir('cle') + +-v3d_versions = ['33', '41', '42'] ++v3d_versions = ['33', '41', '42', '71'] + v3d_libs = [] + + if with_gallium_v3d or with_broadcom_vk +diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c +index eea5d3f050e..5cceb1a82cc 100644 +--- a/src/broadcom/simulator/v3d_simulator.c ++++ b/src/broadcom/simulator/v3d_simulator.c +@@ -490,10 +490,20 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) + + v3d_simulator_perfmon_switch(fd, submit->perfmon_id); + +- if (sim_state.ver >= 41) +- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- else +- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ switch(sim_state.ver) { ++ case 33: ++ v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ break; ++ case 41: ++ case 42: ++ 
v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ break; ++ case 71: ++ v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ break; ++ default: ++ unreachable("Unsupported V3D version\n"); ++ } + + util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *, + sim_bo) { +@@ -635,10 +645,17 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) + static int + v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) + { +- if (sim_state.ver >= 41) +- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); +- else ++ switch(sim_state.ver) { ++ case 33: + return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); ++ case 41: ++ case 42: ++ return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); ++ case 71: ++ return v3d71_simulator_get_param_ioctl(sim_state.v3d, args); ++ default: ++ unreachable("Unsupported V3D version\n"); ++ } + } + + static int +@@ -652,10 +669,20 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) + v3d_simulator_copy_in_handle(file, args->bo_handles[2]); + v3d_simulator_copy_in_handle(file, args->bo_handles[3]); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- else ++ switch(sim_state.ver) { ++ case 33: + ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); ++ break; ++ case 41: ++ case 42: ++ ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); ++ break; ++ case 71: ++ ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args); ++ break; ++ default: ++ unreachable("Unsupported V3D version\n"); ++ } + + v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + +@@ -682,11 +709,19 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) + + v3d_simulator_perfmon_switch(fd, args->perfmon_id); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, +- file->gmp->ofs); +- else +- ret = -1; ++ switch(sim_state.ver) { 
++ case 41: ++ case 42: ++ ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, ++ file->gmp->ofs); ++ break; ++ case 71: ++ ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args, ++ file->gmp->ofs); ++ break; ++ default: ++ ret = -1; ++ } + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_out_handle(file, bo_handles[i]); +@@ -880,10 +915,20 @@ v3d_simulator_init_global() + + util_dynarray_init(&sim_state.bin_oom, NULL); + +- if (sim_state.ver >= 41) +- v3d41_simulator_init_regs(sim_state.v3d); +- else ++ switch(sim_state.ver) { ++ case 33: + v3d33_simulator_init_regs(sim_state.v3d); ++ break; ++ case 41: ++ case 42: ++ v3d41_simulator_init_regs(sim_state.v3d); ++ break; ++ case 71: ++ v3d71_simulator_init_regs(sim_state.v3d); ++ break; ++ default: ++ unreachable("Not supported V3D version\n"); ++ } + } + + struct v3d_simulator_file * +diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h +index ddb079c1455..1472c313a03 100644 +--- a/src/broadcom/simulator/v3d_simulator.h ++++ b/src/broadcom/simulator/v3d_simulator.h +@@ -52,6 +52,11 @@ uint32_t v3d_simulator_get_mem_free(void); + # define v3dX(x) v3d41_##x + # include "v3dx_simulator.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_simulator.h" ++# undef v3dX ++ + #endif + + #endif +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index c9322f0397b..723796b16c9 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -46,11 +46,15 @@ + + #define HW_REGISTER_RO(x) (x) + #define HW_REGISTER_RW(x) (x) +-#if V3D_VERSION >= 41 ++#if V3D_VERSION == 71 ++#include "libs/core/v3d/registers/7.1.5.1/v3d.h" ++#else ++#if V3D_VERSION == 41 || V3D_VERSION == 42 + #include "libs/core/v3d/registers/4.1.35.0/v3d.h" + #else + #include "libs/core/v3d/registers/3.3.0.0/v3d.h" + #endif ++#endif + + #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, 
reg, val) + #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) +@@ -310,16 +314,17 @@ v3d_isr_core(struct v3d_hw *v3d, + return; + } + ++#if V3D_VERSION <= 42 + if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); +- abort(); + } else { + fprintf(stderr, + "Unexpected ISR with core status 0x%08x\n", + core_status); + } + abort(); ++#endif + } + + static void +@@ -396,6 +401,18 @@ v3d_isr_hub(struct v3d_hw *v3d) + } + + handle_mmu_interruptions(v3d, hub_status); ++ ++#if V3D_VERSION == 71 ++ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { ++ fprintf(stderr, "GMP violation at 0x%08x\n", ++ V3D_READ(V3D_GMP_VIO_ADDR)); ++ } else { ++ fprintf(stderr, ++ "Unexpected ISR with status 0x%08x\n", ++ hub_status); ++ } ++ abort(); ++#endif + } + + static void +@@ -436,8 +453,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + * for tracing. Perhaps we should evaluate to do the same here and add + * some debug options. 
+ */ +- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | +- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); ++ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; ++#if V3D_VERSION <= 42 ++ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; ++#endif ++ + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); + V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); + +@@ -447,6 +467,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ + V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ + ++#if V3D_VERSION == 71 ++ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; ++#endif + V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); + V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch new file mode 100644 index 0000000000..5224359446 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch @@ -0,0 +1,30 @@ +From 9e85edd1b347b0e779b393f463f42044a720bcff Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 13:16:49 +0200 +Subject: [PATCH 002/142] broadcom/simulator: reset CFG7 for compute dispatch + in v71 + +This register is new in 7.x, it doesn't seem that we need to +do anything specific for now, but let's make sure it is reset +every time. 
+--- + src/broadcom/simulator/v3dx_simulator.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index 723796b16c9..f23b0538de3 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -227,6 +227,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); ++#endif + /* CFG0 kicks off the job */ + V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch b/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch new file mode 100644 index 0000000000..80190c0aef --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch @@ -0,0 +1,712 @@ +From 6f744bc4bec98f9769486d427e8e2d4e314ae056 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 29 Jun 2021 12:03:24 +0200 +Subject: [PATCH 003/142] broadcom/cle: update the packet definitions for new + generation v71 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Using as reference the spec for 7.1.5. This include totally new +packets, and redefine some that already existed on v42. + +Full list: + * Add Depth Bounds Test Limits + * Redefine Tile Binning Mode Cfg + * Redefine Cfg Bits. There are some changes on the fields: + * Line Rasterization is now 1 bit size + * Depth Bounds Enable (that takes one of the bits of Line Rasterization) + * Early-Z/Early-Z updates enable bits (16-17) figure now as reserved. 
+ * New Z-Clipping mode field + * Redefine Tile Rendering Mode Cfg (Common). Changes with respect to v42: + * New log2 tile height/width fields starting at bit 52/55 + * Due those two news, end pad is smaller + * sub-id has now a size of 3. Bit 4 is reserved. + * Number of render targets: this field max value is now 7 (not + reflected on the xml). + * Maximum BPP is removed on v71 (now bits 40-41 are reserved) + * Depth Buffer disable: on bit 44 + * Update Store Tile Buffer General + * Adding Cfg Render Target Part1/2/3 packets: they replace v4X "Tile + Rendering Mode Cfg (Color)" (real name "Rendering Configuration + (Render Targets Config)"), "Tile Rendering Mode Cfg (Clear Colors + Part1)", "Tile Rendering Mode Cfg (Clear Colors Part2)", and "Tile + Rendering Mode Cfg (Clear Colors Part3)". On those old versions, + the first packet is used to configure 4 render targets. Now that 8 + are supported, invididual per-render-target are used. + * Update ZS clear values packet. + * Add new v71 output formats + * Define Clear Render Targets (Replaces Clear Tile Buffers from v42) + * Redefine GL Shader State Record. Changes copared with v42: + * Fields removed: + * "Coordinate shader has separate input and output VPM blocks" + (reserved bit now) + * "Vertex shader has separate input and output VPM blocks" + (reserved bit now) + * "Address of table of default attribute Values." (we needed to + change the start position for all the following fields) + * New field: + * "Never defer FEP depth writes to fragment shader auto Z writes + on scoreboard conflict" + * Redefine clipper xy scaling: Now it uses 1/64ths of pixels, instead + of 1/256ths + * Update texture shader state. + * Notice we don't use an address type for these fields in the XML + description. 
This is because the addresses are 64-bit aligned + (even though the PRM doesn't say it) which means the 6 LSB bits + are implicitly 0, but the fields are encoded before the 6th bit + of their starting byte, so we can't use the usual trick we do + with address types where the first 6 bits in the byte are + implicitly overwritten by other fields and we have to encode this + manually as a uint field. This would mean that if we had an + actual BO we would also need to add it manually to the job's + list, but since we don't have one, we don't have to do anything + about it. + * Add new RB_Swap field for texture shader state + * Document Cb/Cr addresses as uint fields in texture shader state + * Fixup Blend Config description: we now support 8 RTs. + * TMU config parameter 2 has new fields + * Add new clipper Z without guardband packet in v71 + * Add enums for the Z clip modes accepted in v71 + * Fix texture state array stride packing for V3D 7.1.5 + +Signed-off-by: Iago Toral Quiroga +Signed-off-by: Alejandro Piñeiro + +broadcom/cle: rb_swap +--- + src/broadcom/cle/v3d_packet_v33.xml | 386 ++++++++++++++++++++++++++-- + 1 file changed, 368 insertions(+), 18 deletions(-) + +diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml +index a0242b5f1c2..624353ca2bf 100644 +--- a/src/broadcom/cle/v3d_packet_v33.xml ++++ b/src/broadcom/cle/v3d_packet_v33.xml +@@ -1,4 +1,4 @@ +- ++ + + + +@@ -167,13 +167,36 @@ + + + +- ++ + + + + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + ++ + + + +@@ -1099,7 +1263,7 @@ + + + +- ++ + + + +@@ -1108,6 +1272,15 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1117,7 +1290,7 @@ + + + +- ++ + + + +@@ -1126,6 +1299,19 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1135,7 +1321,7 @@ + + + +- ++ + + + +@@ -1144,6 +1330,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1155,7 +1348,7 @@ + + + +- ++ + + + +@@ -1166,6 +1359,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1240,7 +1440,7 @@ + + + 
+- ++ + + + +@@ -1299,6 +1499,63 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1543,7 +1800,7 @@ + + + +- ++ + + + +@@ -1558,6 +1815,23 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1611,7 +1885,7 @@ + + + +- ++ + + + +@@ -1652,6 +1926,82 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch b/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch new file mode 100644 index 0000000000..6f2fe867f4 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch @@ -0,0 +1,65 @@ +From 569cbe4229df737ce5915c4be2cad534707fb4f7 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 9 Nov 2021 08:50:51 +0100 +Subject: [PATCH 004/142] broadcom/common: retrieve V3D revision number + +The subrev field from the hub ident3 register is bumped with every +hardware revision doing backwards incompatible changes so we want to +keep track of this. + +Instead of modifying the 'ver' field info to acommodate subrev info, +which would require a lot of changes, simply add a new 'rev' field in +devinfo that we can use when we need to make changes based on the +revision number of a hardware release. 
+--- + src/broadcom/common/v3d_device_info.c | 14 +++++++++++++- + src/broadcom/common/v3d_device_info.h | 3 +++ + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c +index 7e0862f1f02..7512fe3a06b 100644 +--- a/src/broadcom/common/v3d_device_info.c ++++ b/src/broadcom/common/v3d_device_info.c +@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + struct drm_v3d_get_param ident1 = { + .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, + }; ++ struct drm_v3d_get_param hub_ident3 = { ++ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, ++ }; + int ret; + + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); +@@ -76,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + return false; + } + +- return true; ++ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); ++ if (ret != 0) { ++ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ devinfo->rev = (hub_ident3.value >> 8) & 0xff; ++ ++ return true; + } +diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h +index 97abd9b8d9f..32cb65cf81f 100644 +--- a/src/broadcom/common/v3d_device_info.h ++++ b/src/broadcom/common/v3d_device_info.h +@@ -34,6 +34,9 @@ struct v3d_device_info { + /** Simple V3D version: major * 10 + minor */ + uint8_t ver; + ++ /** V3D revision number */ ++ uint8_t rev; ++ + /** Size of the VPM, in bytes. 
*/ + int vpm_size; + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch b/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch new file mode 100644 index 0000000000..2f07c250d8 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch @@ -0,0 +1,91 @@ +From c260843c882d25bd31e308566b45d4517fda0fa2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 17 Nov 2021 14:40:47 +0100 +Subject: [PATCH 005/142] broadcom/common: add some common v71 helpers + +--- + src/broadcom/common/v3d_util.c | 27 +++++++++++++++++++++++++++ + src/broadcom/common/v3d_util.h | 27 +++++++++++++++++++++++++++ + 2 files changed, 54 insertions(+) + +diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c +index 57872a923d3..26f5c6b336f 100644 +--- a/src/broadcom/common/v3d_util.c ++++ b/src/broadcom/common/v3d_util.c +@@ -170,3 +170,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type) + unreachable("Unsupported primitive type"); + } + } ++ ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp) ++{ ++ switch (internal_bpp) { ++ case 0 /* V3D_INTERNAL_BPP_32 */: ++ return 1; ++ case 1 /* V3D_INTERNAL_BPP_64 */: ++ return 2; ++ case 2 /* V3D_INTERNAL_BPP_128 */: ++ return 4; ++ default: ++ unreachable("Unsupported internal BPP"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp) ++{ ++ /* stride in multiples of 128 bits, and covers 2 rows. This is the ++ * reason we divide by 2 instead of 4, as we divide number of 32-bit ++ * words per row by 2. 
++ */ ++ ++ return (tile_width * bpp) / 2; ++} +diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h +index eb802b77f67..864fc949ffa 100644 +--- a/src/broadcom/common/v3d_util.h ++++ b/src/broadcom/common/v3d_util.h +@@ -24,6 +24,7 @@ + #ifndef V3D_UTIL_H + #define V3D_UTIL_H + ++#include "util/macros.h" + #include "common/v3d_device_info.h" + #include "pipe/p_defines.h" + +@@ -46,4 +47,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); + uint32_t + v3d_hw_prim_type(enum mesa_prim prim_type); + ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp); ++ ++/* Some configuration packets want the size on log2, but starting at 0 for ++ * size 8. ++ */ ++static inline uint8_t ++log2_tile_size(uint32_t size) ++{ ++ switch(size) { ++ case 8: ++ return 0; ++ case 16: ++ return 1; ++ case 32: ++ return 2; ++ case 64: ++ return 3; ++ default: ++ unreachable("Unsupported tile width/height"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp); + #endif +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch b/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch new file mode 100644 index 0000000000..0250d31af5 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch @@ -0,0 +1,53 @@ +From a5211a4d71acc53183d2a90eb1694d8cce6eb44f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 5 Aug 2021 01:03:11 +0200 +Subject: [PATCH 006/142] broadcom/qpu: add comments on waddr not used on V3D + 7.x + +--- + src/broadcom/qpu/qpu_instr.h | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 2e133472698..45a0cad9760 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ 
b/src/broadcom/qpu/qpu_instr.h +@@ -88,11 +88,11 @@ enum v3d_qpu_uf { + }; + + enum v3d_qpu_waddr { +- V3D_QPU_WADDR_R0 = 0, +- V3D_QPU_WADDR_R1 = 1, +- V3D_QPU_WADDR_R2 = 2, +- V3D_QPU_WADDR_R3 = 3, +- V3D_QPU_WADDR_R4 = 4, ++ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R5 = 5, + V3D_QPU_WADDR_NOP = 6, + V3D_QPU_WADDR_TLB = 7, +@@ -108,12 +108,12 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_SYNC = 16, + V3D_QPU_WADDR_SYNCU = 17, + V3D_QPU_WADDR_SYNCB = 18, +- V3D_QPU_WADDR_RECIP = 19, +- V3D_QPU_WADDR_RSQRT = 20, +- V3D_QPU_WADDR_EXP = 21, +- V3D_QPU_WADDR_LOG = 22, +- V3D_QPU_WADDR_SIN = 23, +- V3D_QPU_WADDR_RSQRT2 = 24, ++ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_TMUC = 32, + V3D_QPU_WADDR_TMUS = 33, + V3D_QPU_WADDR_TMUT = 34, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch b/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch new file mode 100644 index 0000000000..2a1a7ae248 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch @@ -0,0 +1,60 @@ +From 0ccf3043e4a584e5592bb7fad737d5d98ed23db0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 5 Aug 2021 01:00:47 +0200 +Subject: [PATCH 007/142] broadcom/qpu: set V3D 7.x names for some waddr + aliasing + +V3D 7.x got rid of the accumulator, but still uses the values for +WADDR_R5 
and WADDR_R5REP, so let's return a proper name and add some +aliases. +--- + src/broadcom/qpu/qpu_instr.c | 8 ++++++++ + src/broadcom/qpu/qpu_instr.h | 6 ++++-- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 60dabf74e8e..7759fb0efdf 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, + if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) + return "tmu"; + ++ /* V3D 7.x QUAD and REP aliases R5 and R5REPT in the table below ++ */ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) ++ return "quad"; ++ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) ++ return "rep"; ++ + static const char *waddr_magic[] = { + [V3D_QPU_WADDR_R0] = "r0", + [V3D_QPU_WADDR_R1] = "r1", +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 45a0cad9760..19bf721dbe1 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -93,7 +93,8 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ +- V3D_QPU_WADDR_R5 = 5, ++ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ ++ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ + V3D_QPU_WADDR_NOP = 6, + V3D_QPU_WADDR_TLB = 7, + V3D_QPU_WADDR_TLBU = 8, +@@ -129,7 +130,8 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_TMUHSCM = 44, + V3D_QPU_WADDR_TMUHSF = 45, + V3D_QPU_WADDR_TMUHSLOD = 46, +- V3D_QPU_WADDR_R5REP = 55, ++ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ ++ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ + }; + + struct v3d_qpu_flags { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch b/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch new file mode 100644 index 0000000000..96d81a2c1a --- 
/dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch @@ -0,0 +1,241 @@ +From 18de3cc85cf8bbe294e044f7a12abe14e554de0a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Sun, 19 Sep 2021 03:20:18 +0200 +Subject: [PATCH 008/142] broadcom/compiler: rename small_imm to small_imm_b + +Current small_imm is associated with the "B" read address. + +We do this change in advance for v71 support, where we will have 4 +different small_imm (a/b/c/d), so we start with a renaming. +--- + src/broadcom/compiler/qpu_schedule.c | 22 +++++++++---------- + .../compiler/vir_opt_small_immediates.c | 4 ++-- + src/broadcom/compiler/vir_to_qpu.c | 2 +- + src/broadcom/qpu/qpu_disasm.c | 2 +- + src/broadcom/qpu/qpu_instr.h | 2 +- + src/broadcom/qpu/qpu_pack.c | 22 +++++++++---------- + 6 files changed, 27 insertions(+), 27 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 3b32b48f86f..a10fa03ed10 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -160,7 +160,7 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); + break; + case V3D_QPU_MUX_B: +- if (!n->inst->qpu.sig.small_imm) { ++ if (!n->inst->qpu.sig.small_imm_b) { + add_read_dep(state, + state->last_rf[n->inst->qpu.raddr_b], n); + } +@@ -615,7 +615,7 @@ qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, + return true; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && +- !inst->sig.small_imm && (inst->raddr_b == waddr)) ++ !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + + return false; +@@ -790,11 +790,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, + uint64_t raddrs_used = 0; + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << a->raddr_a); +- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) ++ if 
(!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << a->raddr_b); + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << b->raddr_a); +- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) ++ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << b->raddr_b); + + return raddrs_used; +@@ -816,16 +816,16 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + if (naddrs > 2) + return false; + +- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { ++ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { + if (naddrs > 1) + return false; + +- if (add_instr->sig.small_imm && mul_instr->sig.small_imm) ++ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) + if (add_instr->raddr_b != mul_instr->raddr_b) + return false; + +- result->sig.small_imm = true; +- result->raddr_b = add_instr->sig.small_imm ? ++ result->sig.small_imm_b = true; ++ result->raddr_b = add_instr->sig.small_imm_b ? 
+ add_instr->raddr_b : mul_instr->raddr_b; + } + +@@ -836,7 +836,7 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + raddrs_used &= ~(1ll << raddr_a); + result->raddr_a = raddr_a; + +- if (!result->sig.small_imm) { ++ if (!result->sig.small_imm_b) { + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && + raddr_a == add_instr->raddr_b) { + if (add_instr->alu.add.a == V3D_QPU_MUX_B) +@@ -1025,7 +1025,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; +- merge.sig.small_imm |= b->sig.small_imm; ++ merge.sig.small_imm_b |= b->sig.small_imm_b; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; +@@ -1614,7 +1614,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + return false; + + if (inst->raddr_b < 3 && +- !inst->sig.small_imm && ++ !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } +diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c +index 47d7722968d..df0d6c36c9b 100644 +--- a/src/broadcom/compiler/vir_opt_small_immediates.c ++++ b/src/broadcom/compiler/vir_opt_small_immediates.c +@@ -80,7 +80,7 @@ vir_opt_small_immediates(struct v3d_compile *c) + */ + struct v3d_qpu_sig new_sig = inst->qpu.sig; + uint32_t sig_packed; +- new_sig.small_imm = true; ++ new_sig.small_imm_b = true; + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) + continue; + +@@ -89,7 +89,7 @@ vir_opt_small_immediates(struct v3d_compile *c) + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } +- inst->qpu.sig.small_imm = true; ++ inst->qpu.sig.small_imm_b = true; + inst->qpu.raddr_b = packed; + + inst->src[i].file = QFILE_SMALL_IMM; +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index 45e6bfa1470..15c2e3674c2 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ 
b/src/broadcom/compiler/vir_to_qpu.c +@@ -94,7 +94,7 @@ static void + set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + { + if (src.smimm) { +- assert(instr->sig.small_imm); ++ assert(instr->sig.small_imm_b); + *mux = V3D_QPU_MUX_B; + return; + } +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index 28fb2357b97..6aca3c28e78 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -62,7 +62,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, + if (mux == V3D_QPU_MUX_A) { + append(disasm, "rf%d", instr->raddr_a); + } else if (mux == V3D_QPU_MUX_B) { +- if (instr->sig.small_imm) { ++ if (instr->sig.small_imm_b) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 19bf721dbe1..9cd831863b4 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -50,7 +50,7 @@ struct v3d_qpu_sig { + bool ldvpm:1; + bool ldtlb:1; + bool ldtlbu:1; +- bool small_imm:1; ++ bool small_imm_b:1; + bool ucb:1; + bool rotate:1; + bool wrtmuc:1; +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index a875683c6f8..beac591d3c1 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -112,7 +112,7 @@ + #define LDTMU .ldtmu = true + #define LDVARY .ldvary = true + #define LDVPM .ldvpm = true +-#define SMIMM .small_imm = true ++#define SMIMM_B .small_imm_b = true + #define LDTLB .ldtlb = true + #define LDTLBU .ldtlbu = true + #define UCB .ucb = true +@@ -135,8 +135,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDVARY, LDTMU, }, + [13] = { THRSW, LDVARY, LDTMU, }, +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + /* 18-21 reserved */ +@@ -148,8 +148,8 @@ static const struct 
v3d_qpu_sig v33_sig_map[] = { + [27] = { THRSW, LDVPM, LDUNIF }, + [28] = { LDVPM, LDTMU, }, + [29] = { THRSW, LDVPM, LDTMU, }, +- [30] = { SMIMM, LDVPM, }, +- [31] = { SMIMM, }, ++ [30] = { SMIMM_B, LDVPM, }, ++ [31] = { SMIMM_B, }, + }; + + static const struct v3d_qpu_sig v40_sig_map[] = { +@@ -167,8 +167,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + /* 12-13 reserved */ +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -178,7 +178,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [22] = { UCB, }, + [23] = { ROT, }, + /* 24-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, + }; + + static const struct v3d_qpu_sig v41_sig_map[] = { +@@ -197,8 +197,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -210,7 +210,7 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, + }; + + bool +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch b/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch new file mode 100644 index 0000000000..02e8c47d7e --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch @@ -0,0 +1,53 @@ +From 0e87405fe73694c173b7ce14c3d60611f241922c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 5 Aug 2021 00:50:12 +0200 
+Subject: [PATCH 009/142] broadcom/compiler: add small_imm a/c/d on v3d_qpu_sig + +small_imm_a, small_imm_c and small_imm_d added on top of the already +existing small_imm_b, as V3D 7.1 defines 4 small immediates, tied to +the 4 raddr. Note that this is only the definition, and just a inst +validation rule to check that are not used before v71. Any real use is +still pending. +--- + src/broadcom/compiler/qpu_validate.c | 5 +++++ + src/broadcom/qpu/qpu_instr.h | 5 ++++- + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 2cc7a0eb0ae..12788692432 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -115,6 +115,11 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + ++ if (devinfo->ver < 71) { ++ if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d) ++ fail_instr(state, "small imm a/c/d added after V3D 7.1"); ++ } ++ + /* LDVARY writes r5 two instructions later and LDUNIF writes + * r5 one instruction later, which is illegal to have + * together. 
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 9cd831863b4..13b3f37d43f 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -50,10 +50,13 @@ struct v3d_qpu_sig { + bool ldvpm:1; + bool ldtlb:1; + bool ldtlbu:1; +- bool small_imm_b:1; + bool ucb:1; + bool rotate:1; + bool wrtmuc:1; ++ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ ++ bool small_imm_b:1; /* raddr_b (add b) */ ++ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ ++ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ + }; + + enum v3d_qpu_cond { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch b/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch new file mode 100644 index 0000000000..a2d2598b9f --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch @@ -0,0 +1,106 @@ +From eca19c911d9af3b0ab3b563ea65dc455e3d27987 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 4 Aug 2021 01:11:16 +0200 +Subject: [PATCH 010/142] broadcom/qpu: add v71 signal map +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Compared with v41, the differences are: + * 14, 15, 29 and 30 are now about immediate a, b, c, d respectively + * 23 is now reserved. On v42 this was for rotate signals, that are + gone on v71. 
+ +Signed-off-by: Alejandro Piñeiro +Signed-off-by: Iago Toral Quiroga +--- + src/broadcom/qpu/qpu_pack.c | 47 ++++++++++++++++++++++++++++++++++--- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index beac591d3c1..2820d9d4c56 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -112,12 +112,15 @@ + #define LDTMU .ldtmu = true + #define LDVARY .ldvary = true + #define LDVPM .ldvpm = true +-#define SMIMM_B .small_imm_b = true + #define LDTLB .ldtlb = true + #define LDTLBU .ldtlbu = true + #define UCB .ucb = true + #define ROT .rotate = true + #define WRTMUC .wrtmuc = true ++#define SMIMM_A .small_imm_a = true ++#define SMIMM_B .small_imm_b = true ++#define SMIMM_C .small_imm_c = true ++#define SMIMM_D .small_imm_d = true + + static const struct v3d_qpu_sig v33_sig_map[] = { + /* MISC R3 R4 R5 */ +@@ -213,6 +216,40 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [31] = { SMIMM_B, LDTMU, }, + }; + ++ ++static const struct v3d_qpu_sig v71_sig_map[] = { ++ /* MISC phys RF0 */ ++ [0] = { }, ++ [1] = { THRSW, }, ++ [2] = { LDUNIF }, ++ [3] = { THRSW, LDUNIF }, ++ [4] = { LDTMU, }, ++ [5] = { THRSW, LDTMU, }, ++ [6] = { LDTMU, LDUNIF }, ++ [7] = { THRSW, LDTMU, LDUNIF }, ++ [8] = { LDVARY, }, ++ [9] = { THRSW, LDVARY, }, ++ [10] = { LDVARY, LDUNIF }, ++ [11] = { THRSW, LDVARY, LDUNIF }, ++ [12] = { LDUNIFRF }, ++ [13] = { THRSW, LDUNIFRF }, ++ [14] = { SMIMM_A, }, ++ [15] = { SMIMM_B, }, ++ [16] = { LDTLB, }, ++ [17] = { LDTLBU, }, ++ [18] = { WRTMUC }, ++ [19] = { THRSW, WRTMUC }, ++ [20] = { LDVARY, WRTMUC }, ++ [21] = { THRSW, LDVARY, WRTMUC }, ++ [22] = { UCB, }, ++ /* 23 reserved */ ++ [24] = { LDUNIFA}, ++ [25] = { LDUNIFARF }, ++ /* 26-29 reserved */ ++ [30] = { SMIMM_C, }, ++ [31] = { SMIMM_D, }, ++}; ++ + bool + v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, + uint32_t packed_sig, +@@ -221,7 +258,9 @@ v3d_qpu_sig_unpack(const struct 
v3d_device_info *devinfo, + if (packed_sig >= ARRAY_SIZE(v33_sig_map)) + return false; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ *sig = v71_sig_map[packed_sig]; ++ else if (devinfo->ver >= 41) + *sig = v41_sig_map[packed_sig]; + else if (devinfo->ver == 40) + *sig = v40_sig_map[packed_sig]; +@@ -240,7 +279,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, + { + static const struct v3d_qpu_sig *map; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ map = v71_sig_map; ++ else if (devinfo->ver >= 41) + map = v41_sig_map; + else if (devinfo->ver == 40) + map = v40_sig_map; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch b/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch new file mode 100644 index 0000000000..d5813b8c05 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch @@ -0,0 +1,778 @@ +From d10e67a396d713ec81fb133f3516e09fe1e067b6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 6 Aug 2021 01:22:31 +0200 +Subject: [PATCH 011/142] broadcom/qpu: define v3d_qpu_input, use on + v3d_qpu_alu_instr +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +At this point it just tidy up a little the alu_instr structure. + +But also serves to prepare the structure for new changes, as 7.x uses +raddr instead of mux, and it is just easier to add the raddr to the +new input structure. 
+ +Signed-off-by: Alejandro Piñeiro +Signed-off-by: Iago Toral Quiroga +--- + src/broadcom/compiler/qpu_schedule.c | 65 +++++++-------- + src/broadcom/compiler/vir.c | 16 ++-- + src/broadcom/compiler/vir_dump.c | 8 +- + .../compiler/vir_opt_copy_propagate.c | 12 +-- + .../compiler/vir_opt_redundant_flags.c | 8 +- + src/broadcom/compiler/vir_to_qpu.c | 30 +++---- + src/broadcom/qpu/qpu_disasm.c | 16 ++-- + src/broadcom/qpu/qpu_instr.c | 8 +- + src/broadcom/qpu/qpu_instr.h | 13 +-- + src/broadcom/qpu/qpu_pack.c | 82 +++++++++---------- + src/broadcom/qpu/tests/qpu_disasm.c | 8 +- + 11 files changed, 134 insertions(+), 132 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index a10fa03ed10..455fa3867be 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -306,14 +306,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + /* XXX: LOAD_IMM */ + + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) +- process_mux_deps(state, n, inst->alu.add.a); ++ process_mux_deps(state, n, inst->alu.add.a.mux); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) +- process_mux_deps(state, n, inst->alu.add.b); ++ process_mux_deps(state, n, inst->alu.add.b.mux); + + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) +- process_mux_deps(state, n, inst->alu.mul.a); ++ process_mux_deps(state, n, inst->alu.mul.a.mux); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) +- process_mux_deps(state, n, inst->alu.mul.b); ++ process_mux_deps(state, n, inst->alu.mul.b.mux); + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: +@@ -537,22 +537,22 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { ++ mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) { + return true; + } + if 
(v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { ++ mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) { + return true; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { ++ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) { + return true; + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { ++ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) { + return true; + } + } +@@ -839,20 +839,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + if (!result->sig.small_imm_b) { + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && + raddr_a == add_instr->raddr_b) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_B) +- result->alu.add.a = V3D_QPU_MUX_A; +- if (add_instr->alu.add.b == V3D_QPU_MUX_B && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) ++ result->alu.add.a.mux = V3D_QPU_MUX_A; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_A; ++ result->alu.add.b.mux = V3D_QPU_MUX_A; + } + } + if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && + raddr_a == mul_instr->raddr_b) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) +- result->alu.mul.a = V3D_QPU_MUX_A; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) ++ result->alu.mul.a.mux = V3D_QPU_MUX_A; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_A; ++ result->alu.mul.b.mux = V3D_QPU_MUX_A; + } + } + } +@@ -863,20 +863,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + result->raddr_b = raddr_b; + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && + raddr_b == add_instr->raddr_a) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_A) +- 
result->alu.add.a = V3D_QPU_MUX_B; +- if (add_instr->alu.add.b == V3D_QPU_MUX_A && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) ++ result->alu.add.a.mux = V3D_QPU_MUX_B; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_B; ++ result->alu.add.b.mux = V3D_QPU_MUX_B; + } + } + if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && + raddr_b == mul_instr->raddr_a) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) +- result->alu.mul.a = V3D_QPU_MUX_B; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) ++ result->alu.mul.a.mux = V3D_QPU_MUX_B; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_B; ++ result->alu.mul.b.mux = V3D_QPU_MUX_B; + } + } + +@@ -927,11 +927,12 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) + inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; +- inst->alu.mul.a_unpack = inst->alu.add.a_unpack; +- inst->alu.mul.b_unpack = inst->alu.add.b_unpack; ++ ++ inst->alu.mul.a.unpack = inst->alu.add.a.unpack; ++ inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; +- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + } + + static bool +@@ -2064,12 +2065,12 @@ alu_reads_register(struct v3d_qpu_instr *inst, + + if (add) { + num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); +- mux_a = inst->alu.add.a; +- mux_b = inst->alu.add.b; ++ mux_a = inst->alu.add.a.mux; ++ mux_b = inst->alu.add.b.mux; + } else { + num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); +- mux_a = inst->alu.mul.a; +- mux_b = inst->alu.mul.b; ++ mux_a = inst->alu.mul.a.mux; ++ mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < 
num_src; i++) { +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index 660b11b0577..007cb0a941b 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst) + return false; + } + +- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { ++ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +@@ -209,15 +209,15 @@ vir_set_unpack(struct qinst *inst, int src, + + if (vir_is_add(inst)) { + if (src == 0) +- inst->qpu.alu.add.a_unpack = unpack; ++ inst->qpu.alu.add.a.unpack = unpack; + else +- inst->qpu.alu.add.b_unpack = unpack; ++ inst->qpu.alu.add.b.unpack = unpack; + } else { + assert(vir_is_mul(inst)); + if (src == 0) +- inst->qpu.alu.mul.a_unpack = unpack; ++ inst->qpu.alu.mul.a.unpack = unpack; + else +- inst->qpu.alu.mul.b_unpack = unpack; ++ inst->qpu.alu.mul.b.unpack = unpack; + } + } + +diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c +index 5c47bbdc1b0..ab5d4043039 100644 +--- a/src/broadcom/compiler/vir_dump.c ++++ b/src/broadcom/compiler/vir_dump.c +@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); + +- unpack[0] = instr->alu.add.a_unpack; +- unpack[1] = instr->alu.add.b_unpack; ++ unpack[0] = instr->alu.add.a.unpack; ++ unpack[1] = instr->alu.add.b.unpack; + } else { + fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); +@@ -282,8 +282,8 @@ vir_dump_alu(struct 
v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); + +- unpack[0] = instr->alu.mul.a_unpack; +- unpack[1] = instr->alu.mul.b_unpack; ++ unpack[0] = instr->alu.mul.a.unpack; ++ unpack[1] = instr->alu.mul.b.unpack; + } + + for (int i = 0; i < nsrc; i++) { +diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c +index da121c2a5bd..c4aa7255a17 100644 +--- a/src/broadcom/compiler/vir_opt_copy_propagate.c ++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c +@@ -104,14 +104,14 @@ vir_has_unpack(struct qinst *inst, int chan) + + if (vir_is_add(inst)) { + if (chan == 0) +- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; + } else { + if (chan == 0) +- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; + } + } + +@@ -161,7 +161,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + continue; + + /* these ops can't represent abs. 
*/ +- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { ++ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_VFPACK: + case V3D_QPU_A_FROUND: +@@ -189,7 +189,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + + inst->src[i] = mov->src[0]; + if (vir_has_unpack(mov, 0)) { +- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; ++ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; + + vir_set_unpack(inst, i, unpack); + } +diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c +index c7896d57f2b..6b61ed6a39a 100644 +--- a/src/broadcom/compiler/vir_opt_redundant_flags.c ++++ b/src/broadcom/compiler/vir_opt_redundant_flags.c +@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) + a->qpu.flags.mpf != b->qpu.flags.mpf || + a->qpu.alu.add.op != b->qpu.alu.add.op || + a->qpu.alu.mul.op != b->qpu.alu.mul.op || +- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || +- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || ++ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || ++ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || + a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || +- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || +- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || ++ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || ++ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || + a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { + return false; + } +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index 15c2e3674c2..c8b6e0a91a0 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -106,20 +106,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + return; + } + +- if (instr->alu.add.a != V3D_QPU_MUX_A && +- instr->alu.add.b 
!= V3D_QPU_MUX_A && +- instr->alu.mul.a != V3D_QPU_MUX_A && +- instr->alu.mul.b != V3D_QPU_MUX_A) { ++ if (instr->alu.add.a.mux != V3D_QPU_MUX_A && ++ instr->alu.add.b.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.a.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.b.mux != V3D_QPU_MUX_A) { + instr->raddr_a = src.index; + *mux = V3D_QPU_MUX_A; + } else { + if (instr->raddr_a == src.index) { + *mux = V3D_QPU_MUX_A; + } else { +- assert(!(instr->alu.add.a == V3D_QPU_MUX_B && +- instr->alu.add.b == V3D_QPU_MUX_B && +- instr->alu.mul.a == V3D_QPU_MUX_B && +- instr->alu.mul.b == V3D_QPU_MUX_B) || ++ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && ++ instr->alu.add.b.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.a.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.b.mux == V3D_QPU_MUX_B) || + src.index == instr->raddr_b); + + instr->raddr_b = src.index; +@@ -147,14 +147,14 @@ is_no_op_mov(struct qinst *qinst) + if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) + return false; + +- if (qinst->qpu.alu.mul.a != ++ if (qinst->qpu.alu.mul.a.mux != + V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) { + return false; + } + } else { + int raddr; + +- switch (qinst->qpu.alu.mul.a) { ++ switch (qinst->qpu.alu.mul.a.mux) { + case V3D_QPU_MUX_A: + raddr = qinst->qpu.raddr_a; + break; +@@ -171,7 +171,7 @@ is_no_op_mov(struct qinst *qinst) + /* No packing or flags updates, or we need to execute the + * instruction. 
+ */ +- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || ++ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || + qinst->qpu.flags.mc != V3D_QPU_COND_NONE || + qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || +@@ -302,11 +302,11 @@ v3d_generate_code_block(struct v3d_compile *c, + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.a, src[0]); ++ &qinst->qpu.alu.add.a.mux, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.b, src[1]); ++ &qinst->qpu.alu.add.b.mux, src[1]); + } + + qinst->qpu.alu.add.waddr = dst.index; +@@ -314,11 +314,11 @@ v3d_generate_code_block(struct v3d_compile *c, + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.a, src[0]); ++ &qinst->qpu.alu.mul.a.mux, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.b, src[1]); ++ &qinst->qpu.alu.mul.b.mux, src[1]); + } + + qinst->qpu.alu.mul.waddr = dst.index; +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index 6aca3c28e78..588a665f770 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -121,16 +121,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); ++ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); ++ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.b.unpack)); + } + } + +@@ -164,16 +164,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, + if (num_src >= 1) 
{ + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); ++ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); ++ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); + } + } + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 7759fb0efdf..7ece8b5e570 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -926,10 +926,10 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + +- return ((add_nsrc > 0 && inst->alu.add.a == mux) || +- (add_nsrc > 1 && inst->alu.add.b == mux) || +- (mul_nsrc > 0 && inst->alu.mul.a == mux) || +- (mul_nsrc > 1 && inst->alu.mul.b == mux)); ++ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || ++ (add_nsrc > 1 && inst->alu.add.b.mux == mux) || ++ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || ++ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); + } + + bool +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 13b3f37d43f..53a51bfb3e1 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -294,25 +294,26 @@ enum v3d_qpu_mux { + V3D_QPU_MUX_B, + }; + ++struct v3d_qpu_input { ++ enum v3d_qpu_mux mux; ++ enum v3d_qpu_input_unpack unpack; ++}; ++ + struct v3d_qpu_alu_instr { + struct { + enum v3d_qpu_add_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- 
enum v3d_qpu_input_unpack b_unpack; + } add; + + struct { + enum v3d_qpu_mul_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- enum v3d_qpu_input_unpack b_unpack; + } mul; + }; + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 2820d9d4c56..6e975793fc0 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -853,12 +853,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.add.b_unpack)) { ++ &instr->alu.add.b.unpack)) { + return false; + } + break; +@@ -872,7 +872,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = mux_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -884,7 +884,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -892,23 +892,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + default: + 
instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.add.a = mux_a; +- instr->alu.add.b = mux_b; ++ instr->alu.add.a.mux = mux_a; ++ instr->alu.add.b.mux = mux_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; +@@ -956,12 +956,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.mul.b_unpack)) { ++ &instr->alu.mul.b.unpack)) { + return false; + } + +@@ -972,7 +972,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + ((mux_b >> 2) & 1)); + + if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +@@ -982,23 +982,23 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.mul.a = mux_a; +- instr->alu.mul.b = mux_b; ++ instr->alu.mul.a.mux = mux_a; ++ 
instr->alu.mul.b.mux = mux_b; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + +@@ -1030,8 +1030,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { + uint32_t waddr = instr->alu.add.waddr; +- uint32_t mux_a = instr->alu.add.a; +- uint32_t mux_b = instr->alu.add.b; ++ uint32_t mux_a = instr->alu.add.a.mux; ++ uint32_t mux_b = instr->alu.add.b.mux; + int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), +@@ -1102,12 +1102,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + opcode |= output_pack << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } +@@ -1141,17 +1141,17 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + uint32_t a_unpack; + uint32_t b_unpack; + +- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || +- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } +@@ -1176,7 +1176,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + mux_b |= packed; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if 
(!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1194,7 +1194,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + return false; + + uint32_t packed; +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1207,11 +1207,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1221,8 +1221,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; +@@ -1242,8 +1242,8 @@ static bool + v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { +- uint32_t mux_a = instr->alu.mul.a; +- uint32_t mux_b = instr->alu.mul.b; ++ uint32_t mux_a = instr->alu.mul.a.mux; ++ uint32_t mux_b = instr->alu.mul.b.mux; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = +@@ -1277,13 +1277,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + */ + opcode += packed << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + +- if 
(!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } +@@ -1301,7 +1301,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + opcode |= (packed >> 1) & 1; + mux_b = (packed & 1) << 2; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +@@ -1315,16 +1315,16 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + +- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; +diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c +index 2f8e19c73fe..be7b78d5ef0 100644 +--- a/src/broadcom/qpu/tests/qpu_disasm.c ++++ b/src/broadcom/qpu/tests/qpu_disasm.c +@@ -160,10 +160,10 @@ main(int argc, char **argv) + /* Swap the operands to be sure that we test + * how the QPUs distinguish between these ops. 
+ */ +- swap_mux(&instr.alu.add.a, +- &instr.alu.add.b); +- swap_pack(&instr.alu.add.a_unpack, +- &instr.alu.add.b_unpack); ++ swap_mux(&instr.alu.add.a.mux, ++ &instr.alu.add.b.mux); ++ swap_pack(&instr.alu.add.a.unpack, ++ &instr.alu.add.b.unpack); + break; + default: + break; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch b/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch new file mode 100644 index 0000000000..9c2303f4e4 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch @@ -0,0 +1,45 @@ +From 52ea09792ff8a438ccdecac47b8415657be90098 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 6 Aug 2021 01:33:32 +0200 +Subject: [PATCH 012/142] broadcom/qpu: add raddr on v3d_qpu_input + +On V3D 7.x mux are not used, and raddr_a/b/c/d are used instead + +This is not perfect, as for v71, the raddr_a/b defined at qpu_instr +became superfluous. But the alternative would be to define two +different structs, or even having them defined based on version +ifdefs, so this is a reasonable compromise. 
+--- + src/broadcom/qpu/qpu_instr.h | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 53a51bfb3e1..9e56e2d6a99 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -295,7 +295,10 @@ enum v3d_qpu_mux { + }; + + struct v3d_qpu_input { +- enum v3d_qpu_mux mux; ++ union { ++ enum v3d_qpu_mux mux; /* V3D 4.x */ ++ uint8_t raddr; /* V3D 7.x */ ++ }; + enum v3d_qpu_input_unpack unpack; + }; + +@@ -385,8 +388,8 @@ struct v3d_qpu_instr { + struct v3d_qpu_sig sig; + uint8_t sig_addr; + bool sig_magic; /* If the signal writes to a magic address */ +- uint8_t raddr_a; +- uint8_t raddr_b; ++ uint8_t raddr_a; /* V3D 4.x */ ++ uint8_t raddr_b; /* V3D 4.x*/ + struct v3d_qpu_flags flags; + + union { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch b/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch new file mode 100644 index 0000000000..162529e963 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch @@ -0,0 +1,37 @@ +From 3e5ad0881c2789619cdf65f40a44d5481e28e800 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 12 Aug 2021 02:24:02 +0200 +Subject: [PATCH 013/142] broadcom/qpu: defining shift/mask for raddr_c/d + +On V3D 7.x it replaces mul_a/b and add_a/b +--- + src/broadcom/qpu/qpu_pack.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 6e975793fc0..4f106909729 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -84,6 +84,9 @@ + #define V3D_QPU_MUL_A_SHIFT 18 + #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) + ++#define V3D_QPU_RADDR_C_SHIFT 18 ++#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) ++ + #define V3D_QPU_ADD_B_SHIFT 15 + #define 
V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) + +@@ -98,6 +101,9 @@ + #define V3D_QPU_BRANCH_BDI_SHIFT 12 + #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) + ++#define V3D_QPU_RADDR_D_SHIFT 12 ++#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) ++ + #define V3D_QPU_RADDR_A_SHIFT 6 + #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch b/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch new file mode 100644 index 0000000000..1855816d95 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch @@ -0,0 +1,46 @@ +From 81febf14fe05ad26e992275b911e8bc1e1416ebc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 17 Sep 2021 01:04:31 +0200 +Subject: [PATCH 014/142] broadcom/commmon: add has_accumulators field on + v3d_device_info + +Even if we can just check for the version on the code, checking for +this field makes several places more readable. So for example, on the +register allocate code we doesn't assign an accumulator because we +don't have accumulators on that hw, instead of because hw version is a +given one. 
+--- + src/broadcom/common/v3d_device_info.c | 2 ++ + src/broadcom/common/v3d_device_info.h | 3 +++ + 2 files changed, 5 insertions(+) + +diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c +index 7512fe3a06b..7bc2b662cfc 100644 +--- a/src/broadcom/common/v3d_device_info.c ++++ b/src/broadcom/common/v3d_device_info.c +@@ -65,6 +65,8 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + int qups = (ident1.value >> 8) & 0xf; + devinfo->qpu_count = nslc * qups; + ++ devinfo->has_accumulators = devinfo->ver < 71; ++ + switch (devinfo->ver) { + case 33: + case 41: +diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h +index 32cb65cf81f..8dfc7858727 100644 +--- a/src/broadcom/common/v3d_device_info.h ++++ b/src/broadcom/common/v3d_device_info.h +@@ -42,6 +42,9 @@ struct v3d_device_info { + + /* NSLC * QUPS from the core's IDENT registers. */ + int qpu_count; ++ ++ /* If the hw has accumulator registers */ ++ bool has_accumulators; + }; + + typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch b/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch new file mode 100644 index 0000000000..8bd646ac94 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch @@ -0,0 +1,52 @@ +From 7d42eca87b6e144697810405308d99d200dca62a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 15 Sep 2021 10:56:43 +0200 +Subject: [PATCH 015/142] broadcom/qpu: add qpu_writes_rf0_implicitly helper + +On v71 rf0 replaces r5 as the register that gets updated implicitly +with uniform loads, and gets the C coefficient with ldvary. This +helper return if rf0 gets implicitly updated. 
+--- + src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++ + src/broadcom/qpu/qpu_instr.h | 2 ++ + 2 files changed, 14 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 7ece8b5e570..8de99c611d5 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -920,6 +920,18 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + return false; + } + ++bool ++v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst) ++{ ++ if (devinfo->ver >= 71 && ++ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { ++ return true; ++ } ++ ++ return false; ++} ++ + bool + v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) + { +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 9e56e2d6a99..a25be8e0ee6 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -473,6 +473,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch new file mode 100644 index 0000000000..8afa579075 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch @@ -0,0 +1,1258 @@ +From f0859613bd59e14fb21571e7978bb5c5d5e9c6d7 Mon Sep 17 
00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Sat, 7 Aug 2021 02:20:39 +0200 +Subject: [PATCH 016/142] broadcom/qpu: add pack/unpack support for v71 + +Note that we provide new v71 alu pack/unpack methods. As there are a +lot that it is equivalent, initially we tried to use existing methods +as template and add version checks on the existing methods. At some +early point that become just really unreadable, so it become better to +just provide new methods, even if v42 and v71 methods have a really +similar structure. + +Note that we have splitted the op tables, and created a two (add/mul) +for v71. As the description struct include versioning info, we could +have just used one table. But, specially with the add table, there are +a lot of differences with v71. So it is slightly tidier this +way. Also, taking into account that we do a linear search on the +tables, this can be even justified by performance. +--- + src/broadcom/qpu/qpu_pack.c | 1049 ++++++++++++++++++++++++++++++----- + 1 file changed, 904 insertions(+), 145 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 4f106909729..4045275cb9a 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -490,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, + + /* Make a mapping of the table of opcodes in the spec. The opcode is + * determined by a combination of the opcode field, and in the case of 0 or +- * 1-arg opcodes, the mux_b field as well. ++ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as ++ * well. 
+ */ +-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) +-#define ANYMUX MUX_MASK(0, 7) ++#define OP_MASK(val) BITFIELD64_BIT(val) ++#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) ++#define ANYMUX OP_RANGE(0, 7) ++#define ANYOPMASK OP_RANGE(0, 63) + + struct opcode_desc { + uint8_t opcode_first; + uint8_t opcode_last; +- uint8_t mux_b_mask; +- uint8_t mux_a_mask; ++ ++ union { ++ struct { ++ uint8_t b_mask; ++ uint8_t a_mask; ++ } mux; ++ uint64_t raddr_mask; ++ }; ++ + uint8_t op; + + /* first_ver == 0 if it's the same across all V3D versions. +@@ -512,122 +522,288 @@ struct opcode_desc { + uint8_t last_ver; + }; + +-static const struct opcode_desc add_ops[] = { ++static const struct opcode_desc add_ops_v33[] = { + /* FADD is FADDNF depending on the order of the mux_a/mux_b. */ +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, +- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, +- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, +- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, +- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, +- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, +- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, +- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, +- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, +- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, +- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, +- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, ++ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, ++ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 60, 60, 
.mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, ++ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, ++ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, ++ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, ++ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, ++ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, ++ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, ++ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, ++ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, ++ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */ +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, +- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, +- +- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, +- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, +- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, +- +- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, +- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, +- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, +- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, +- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, +- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, +- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, +- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, +- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, +- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, +- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, +- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, +- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, +- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, +- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, +- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, +- { 187, 
187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, +- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, +- +- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, +- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, +- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, +- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, +- +- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, +- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, +- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, +- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, +- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, +- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, +- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, +- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, +- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, +- +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, +- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, +- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, +- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, +- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, +- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, +- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, ++ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, ++ ++ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, ++ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, ++ { 183, 183, .mux.b_mask = ANYMUX, 
.mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, ++ ++ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, ++ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, ++ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, ++ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, ++ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, ++ { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, ++ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, ++ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, ++ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, ++ { 187, 187, .mux.b_mask 
= OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, ++ ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + + /* FIXME: MORE COMPLICATED */ +- /* { 190, 191, ANYMUX, ANYMUX, 
V3D_QPU_A_VFMOVABSNEGNAB }, */ ++ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + +- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, +- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, ++ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, ++ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, + +- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, +- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, +- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, +- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, +- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, +- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, +- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, +- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, ++ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, ++ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, ++ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, ++ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, ++ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, ++ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, + +- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, +- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, ++ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, ++ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, + + /* The stvpms are distinguished by the waddr field. 
*/ +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, ++ ++ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, ++ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, ++ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, ++}; ++ ++static const struct opcode_desc mul_ops_v33[] = { ++ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, ++ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, ++ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, ++ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, ++ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, ++ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, ++ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, ++ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, ++ ++ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, ++}; ++ ++/* Note that it would have been possible to define all the add/mul opcodes in ++ * just one table, using the first_ver/last_ver. But taking into account that ++ * for v71 there were a lot of changes, it was more tidy this way. Also right ++ * now we are doing a linear search on those tables, so this maintains the ++ * tables smaller. 
++ * ++ * Just in case we merge the tables, we define the first_ver as 71 for those ++ * opcodes that changed on v71 ++ */ ++static const struct opcode_desc add_ops_v71[] = { ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, ++ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, ++ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, ++ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, ++ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, ++ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, ++ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, ++ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, ++ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, ++ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, ++ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, ++ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, ++ ++ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, ++ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, ++ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, ++ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, ++ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, ++ ++ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, ++ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, ++ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, ++ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, ++ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, ++ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, ++ ++ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, 
.raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, ++ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, ++ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, ++ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, ++ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, ++ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, ++ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, ++ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, ++ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, ++ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, ++ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, ++ ++ { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD }, ++ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, ++ ++ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, + +- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, +- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, +- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, ++ /* The stvpms are distinguished by the waddr field. 
*/ ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, ++ ++ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask 
= OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, + }; + +-static const struct opcode_desc mul_ops[] = { +- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, +- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, +- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, +- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, +- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, +- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, +- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, +- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, +- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, ++static const struct opcode_desc mul_ops_v71[] = { ++ /* For V3D 7.1, second mask field would be ignored */ ++ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, ++ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, ++ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 3, 3, .raddr_mask = 
ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, ++ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, ++ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, ++ ++ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, + }; + + /* Returns true if op_desc should be filtered out based on devinfo->ver +@@ -636,17 +812,23 @@ static const struct opcode_desc mul_ops[] = { + */ + static bool + opcode_invalid_in_version(const struct v3d_device_info *devinfo, +- const struct opcode_desc *op_desc) ++ const uint8_t first_ver, ++ const uint8_t last_ver) + { +- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || +- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); ++ return (first_ver != 0 && devinfo->ver < first_ver) || ++ (last_ver != 0 && devinfo->ver > last_ver); + } + ++/* Note that we pass as parameters mux_a, mux_b and raddr, even if depending ++ * on the devinfo->ver some would be ignored. 
We do this way just to avoid ++ * having two really similar lookup_opcode methods ++ */ + static const struct opcode_desc * + lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, + size_t num_opcodes, uint32_t opcode, +- uint32_t mux_a, uint32_t mux_b) ++ uint32_t mux_a, uint32_t mux_b, ++ uint32_t raddr) + { + for (int i = 0; i < num_opcodes; i++) { + const struct opcode_desc *op_desc = &opcodes[i]; +@@ -655,14 +837,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + opcode > op_desc->opcode_last) + continue; + +- if (opcode_invalid_in_version(devinfo, op_desc)) ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) + continue; + +- if (!(op_desc->mux_b_mask & (1 << mux_b))) +- continue; ++ if (devinfo->ver < 71) { ++ if (!(op_desc->mux.b_mask & (1 << mux_b))) ++ continue; + +- if (!(op_desc->mux_a_mask & (1 << mux_a))) +- continue; ++ if (!(op_desc->mux.a_mask & (1 << mux_a))) ++ continue; ++ } else { ++ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) ++ continue; ++ } + + return op_desc; + } +@@ -784,8 +971,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack, + } + + static bool +-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +- struct v3d_qpu_instr *instr) ++v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); +@@ -802,8 +989,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + map_op = (map_op - 253 + 245); + + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), +- map_op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), ++ map_op, mux_a, mux_b, 0); + + if (!desc) + return false; +@@ -939,8 +1127,160 @@ 
v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + } + + static bool +-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); ++ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); ++ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); ++ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ uint32_t map_op = op; ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ map_op, 0, 0, ++ raddr_b); ++ if (!desc) ++ return false; ++ ++ instr->alu.add.op = desc->op; ++ ++ /* Some QPU ops require a bit more than just basic opcode and mux a/b ++ * comparisons to distinguish them. ++ */ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ case V3D_QPU_A_STVPMD: ++ case V3D_QPU_A_STVPMP: ++ switch (waddr) { ++ case 0: ++ instr->alu.add.op = V3D_QPU_A_STVPMV; ++ break; ++ case 1: ++ instr->alu.add.op = V3D_QPU_A_STVPMD; ++ break; ++ case 2: ++ instr->alu.add.op = V3D_QPU_A_STVPMP; ++ break; ++ default: ++ return false; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: ++ case V3D_QPU_A_VFPACK: ++ if (instr->alu.add.op != V3D_QPU_A_VFPACK && ++ instr->alu.add.op != V3D_QPU_A_FCMP) { ++ instr->alu.add.output_pack = (op >> 4) & 0x3; ++ } else { ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.add.b.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FFLOOR: ++ case 
V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ unreachable("pending v71 update"); ++ if (!v3d_qpu_float16_unpack_unpack(op & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ ++ default: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.add.a.raddr = raddr_a; ++ instr->alu.add.b.raddr = raddr_b; ++ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ ++ instr->alu.add.magic_write = false; ++ if (packed_inst & V3D_QPU_MA) { ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_LDVPMV_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; ++ break; ++ case V3D_QPU_A_LDVPMD_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; ++ break; ++ case V3D_QPU_A_LDVPMG_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT; ++ break; ++ default: ++ instr->alu.add.magic_write = true; ++ break; ++ } ++ } ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); ++} ++ ++static bool ++v3d33_qpu_mul_unpack(const 
struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); +@@ -948,9 +1288,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + + { + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, mul_ops, +- ARRAY_SIZE(mul_ops), +- op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, ++ mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), ++ op, mux_a, mux_b, 0); + if (!desc) + return false; + +@@ -1011,6 +1352,91 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + return true; + } + ++static bool ++v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); ++ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); ++ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); ++ ++ { ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ op, 0, 0, ++ raddr_d); ++ if (!desc) ++ return false; ++ ++ instr->alu.mul.op = desc->op; ++ } ++ ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: ++ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.mul.b.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_FMOV: ++ instr->alu.mul.output_pack = (raddr_d >> 2) & 1; ++ ++ if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_VFMUL: ++ unreachable("pending v71 update"); ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, 
++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ break; ++ ++ default: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.mul.a.raddr = raddr_c; ++ instr->alu.mul.b.raddr = raddr_d; ++ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); ++ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); ++} ++ + static const struct opcode_desc * + lookup_opcode_from_instr(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, size_t num_opcodes, +@@ -1022,7 +1448,7 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, + if (op_desc->op != op) + continue; + +- if (opcode_invalid_in_version(devinfo, op_desc)) ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) + continue; + + return op_desc; +@@ -1032,30 +1458,31 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, + } + + static bool +-v3d_qpu_add_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { + uint32_t waddr = instr->alu.add.waddr; + uint32_t mux_a = instr->alu.add.a.mux; + uint32_t mux_b = instr->alu.add.b.mux; + int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), ++ lookup_opcode_from_instr(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), + 
instr->alu.add.op); + + if (!desc) + return false; + +- uint32_t opcode = desc->opcode_first; ++ uint32_t opcode = opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its mux values may be used to + * identify the operation type. + */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + bool no_magic_write = false; + +@@ -1162,8 +1589,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + return false; + } + +- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); +- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } +@@ -1188,7 +1615,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + if (packed == 0) + return false; +- opcode = (opcode & ~(1 << 2)) | packed << 2; ++ opcode = (opcode & ~(0x3 << 2)) | packed << 2; + break; + } + +@@ -1245,15 +1672,211 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + + static bool +-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t waddr = instr->alu.add.waddr; ++ uint32_t raddr_a = instr->alu.add.a.raddr; ++ uint32_t raddr_b = instr->alu.add.b.raddr; ++ ++ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ instr->alu.add.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = opcode = desc->opcode_first; ++ ++ /* If an operation doesn't use an arg, its raddr values may be used to ++ * identify the operation type. 
++ */ ++ if (nsrc < 2) ++ raddr_b = ffsll(desc->raddr_mask) - 1; ++ ++ bool no_magic_write = false; ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ waddr = 0; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMD: ++ waddr = 1; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMP: ++ waddr = 2; ++ no_magic_write = true; ++ break; ++ ++ case V3D_QPU_A_LDVPMV_IN: ++ case V3D_QPU_A_LDVPMD_IN: ++ case V3D_QPU_A_LDVPMP: ++ case V3D_QPU_A_LDVPMG_IN: ++ assert(!instr->alu.add.magic_write); ++ break; ++ ++ case V3D_QPU_A_LDVPMV_OUT: ++ case V3D_QPU_A_LDVPMD_OUT: ++ case V3D_QPU_A_LDVPMG_OUT: ++ assert(!instr->alu.add.magic_write); ++ *packed_instr |= V3D_QPU_MA; ++ break; ++ ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: { ++ uint32_t output_pack; ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.op != V3D_QPU_A_FCMP) { ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &output_pack)) { ++ return false; ++ } ++ opcode |= output_pack << 4; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ opcode |= a_unpack << 2; ++ opcode |= b_unpack << 0; ++ ++ break; ++ } ++ ++ case V3D_QPU_A_VFPACK: { ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); ++ ++ break; ++ 
} ++ ++ case V3D_QPU_A_FFLOOR: ++ case V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ uint32_t packed; ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ ++ raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; ++ ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed; ++ break; ++ ++ default: ++ if (instr->alu.add.op != V3D_QPU_A_NOP && ++ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); ++ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); ++ if (instr->alu.add.magic_write && !no_magic_write) ++ *packed_instr |= V3D_QPU_MA; ++ ++ return true; ++} ++ ++static bool ++v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { + uint32_t mux_a = 
instr->alu.mul.a.mux; + uint32_t mux_b = instr->alu.mul.b.mux; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), ++ lookup_opcode_from_instr(devinfo, mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), + instr->alu.mul.op); + + if (!desc) +@@ -1265,10 +1888,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + * that here. If mux a/b determine packing, it will be set below. + */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { +@@ -1351,6 +1974,130 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + return true; + } + ++static bool ++v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t raddr_c = instr->alu.mul.a.raddr; ++ uint32_t raddr_d = instr->alu.mul.b.raddr; ++ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ instr->alu.mul.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = desc->opcode_first; ++ ++ /* Some opcodes have a single valid value for their raddr_d, so set ++ * that here. If raddr_b determine packing, it will be set below. ++ */ ++ if (nsrc < 2) ++ raddr_d = ffsll(desc->raddr_mask) - 1; ++ ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ /* No need for a +1 because desc->opcode_first has a 1 in this ++ * field. 
++ */ ++ opcode += packed << 4; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 2; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 0; ++ break; ++ } ++ ++ case V3D_QPU_M_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ opcode |= (packed >> 1) & 1; ++ raddr_d = (packed & 1) << 2; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_d |= packed; ++ break; ++ } ++ ++ case V3D_QPU_M_VFMUL: { ++ unreachable("pending v71 update"); ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) ++ opcode = 8; ++ else ++ opcode |= (packed + 4) & 7; ++ ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) ++ return false; ++ ++ break; ++ } ++ ++ default: ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); ++ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); ++ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); ++ if (instr->alu.mul.magic_write) ++ *packed_instr |= V3D_QPU_MM; ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_add_pack(devinfo, instr, packed_instr); ++} ++ ++static bool ++v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return 
v3d33_qpu_mul_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); ++} ++ + static bool + v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + uint64_t packed_instr, +@@ -1379,8 +2126,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + return false; + } + +- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); +- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ if (devinfo->ver <= 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); ++ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr)) + return false; +@@ -1466,8 +2219,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + + if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { +- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); +- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ if (devinfo->ver < 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) + return false; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch new file mode 100644 index 0000000000..0bf1274d45 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch @@ -0,0 +1,261 @@ +From ebba9019461083687f6afd23ff0d4646c1a667cb Mon Sep 17 00:00:00 2001 +From: Iago 
Toral Quiroga +Date: Sun, 29 Jan 2023 00:27:11 +0100 +Subject: [PATCH 017/142] broadcom/compiler: update node/temp translation for + v71 + +As the offset applied needs to take into account if we have +accumulators or not. +--- + src/broadcom/compiler/vir_register_allocate.c | 68 +++++++++---------- + 1 file changed, 34 insertions(+), 34 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index b22f915d1df..aa9473d124b 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -39,30 +39,31 @@ + CLASS_BITS_R5) + + static inline uint32_t +-temp_to_node(uint32_t temp) ++temp_to_node(struct v3d_compile *c, uint32_t temp) + { +- return temp + ACC_COUNT; ++ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0); + } + + static inline uint32_t +-node_to_temp(uint32_t node) ++node_to_temp(struct v3d_compile *c, uint32_t node) + { +- assert(node >= ACC_COUNT); +- return node - ACC_COUNT; ++ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || ++ (!c->devinfo->has_accumulators && node >= 0)); ++ return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : 0); + } + + static inline uint8_t +-get_temp_class_bits(struct v3d_ra_node_info *nodes, ++get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) + { +- return nodes->info[temp_to_node(temp)].class_bits; ++ return c->nodes.info[temp_to_node(c, temp)].class_bits; + } + + static inline void +-set_temp_class_bits(struct v3d_ra_node_info *nodes, ++set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) + { +- nodes->info[temp_to_node(temp)].class_bits = class_bits; ++ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; + } + + static struct ra_class * +@@ -84,7 +85,7 @@ static inline struct ra_class * + choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) + { + assert(temp < c->num_temps && temp < c->nodes.alloc_count); +- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp)); ++ return choose_reg_class(c, get_temp_class_bits(c, temp)); + } + + static inline bool +@@ -313,7 +314,7 @@ v3d_choose_spill_node(struct v3d_compile *c) + + for (unsigned i = 0; i < c->num_temps; i++) { + if (BITSET_TEST(c->spillable, i)) { +- ra_set_node_spill_cost(c->g, temp_to_node(i), ++ ra_set_node_spill_cost(c->g, temp_to_node(c, i), + spill_costs[i]); + } + } +@@ -482,7 +483,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +@@ -509,8 +510,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c, + * same register class bits as the original. 
+ */ + if (inst == position) { +- uint8_t class_bits = get_temp_class_bits(&c->nodes, +- inst->dst.index); ++ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { +@@ -574,7 +574,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + reconstruct_op = orig_def->qpu.alu.add.op; + } + +- uint32_t spill_node = temp_to_node(spill_temp); ++ uint32_t spill_node = temp_to_node(c, spill_temp); + + /* We must disable the ldunif optimization if we are spilling uniforms */ + bool had_disable_ldunif_opt = c->disable_ldunif_opt; +@@ -739,12 +739,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * update node priorities based one new liveness data. + */ + uint32_t sb_temp =c->spill_base.index; +- uint32_t sb_node = temp_to_node(sb_temp); ++ uint32_t sb_node = temp_to_node(c, sb_temp); + for (uint32_t i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] == -1) + continue; + +- uint32_t node_i = temp_to_node(i); ++ uint32_t node_i = temp_to_node(c, i); + c->nodes.info[node_i].priority = + c->temp_end[i] - c->temp_start[i]; + +@@ -752,7 +752,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { +- uint32_t node_j = temp_to_node(j); ++ uint32_t node_j = temp_to_node(c, j); + ra_add_node_interference(c->g, node_i, node_j); + } + } +@@ -958,7 +958,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[3]); + } + } +@@ -968,7 +968,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) 
{ + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[4]); + } + } +@@ -987,7 +987,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1002,7 +1002,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1024,7 +1024,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); +- uint32_t node = temp_to_node(inst->dst.index); ++ uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, + PHYS_INDEX + inst->src[0].index); + break; +@@ -1043,9 +1043,9 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + */ + if (!inst->qpu.sig.ldunif) { + uint8_t class_bits = +- get_temp_class_bits(&c->nodes, inst->dst.index) & ++ get_temp_class_bits(c, inst->dst.index) & + ~CLASS_BITS_R5; +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + class_bits); + + } else { +@@ -1054,7 +1054,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * loads interfere with each other. 
+ */ + if (c->devinfo->ver < 40) { +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_R5); + } + } +@@ -1064,7 +1064,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + if (inst->qpu.sig.thrsw) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { +- set_temp_class_bits(&c->nodes, i, ++ set_temp_class_bits(c, i, + CLASS_BITS_PHYS); + } + } +@@ -1125,7 +1125,7 @@ v3d_register_allocate(struct v3d_compile *c) + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + } else { +- uint32_t t = node_to_temp(i); ++ uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; + c->nodes.info[i].class_bits = CLASS_BITS_ANY; +@@ -1143,7 +1143,7 @@ v3d_register_allocate(struct v3d_compile *c) + + /* Set the register classes for all our temporaries in the graph */ + for (uint32_t i = 0; i < c->num_temps; i++) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); + } + +@@ -1153,8 +1153,8 @@ v3d_register_allocate(struct v3d_compile *c) + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, +- temp_to_node(i), +- temp_to_node(j)); ++ temp_to_node(c, i), ++ temp_to_node(c, j)); + } + } + } +@@ -1171,7 +1171,7 @@ v3d_register_allocate(struct v3d_compile *c) + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + if (node != -1) { + v3d_spill_reg(c, acc_nodes, temp); + continue; +@@ -1186,7 +1186,7 @@ v3d_register_allocate(struct v3d_compile *c) + if (node == -1) + goto spill_fail; + +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + 
get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { +@@ -1201,7 +1201,7 @@ v3d_register_allocate(struct v3d_compile *c) + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { +- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i)); ++ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < PHYS_INDEX) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch b/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch new file mode 100644 index 0000000000..88f753bb0b --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch @@ -0,0 +1,144 @@ +From 9b2dfe0286212aba3687a06023cc5b4ce9944ee0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Mon, 23 Aug 2021 02:18:43 +0200 +Subject: [PATCH 018/142] broadcom/compiler: phys index depends on hw version + +For 7.1 there are not accumulators. So we replace the macro with a +function call. 
+--- + src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++----- + 1 file changed, 29 insertions(+), 10 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index aa9473d124b..a358b616e13 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -28,9 +28,19 @@ + + #define ACC_INDEX 0 + #define ACC_COUNT 6 +-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) +-#define PHYS_COUNT 64 + ++#define PHYS_COUNT 64 ++ ++static uint8_t ++get_phys_index(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return ACC_INDEX + ACC_COUNT; ++ else ++ return 0; ++} ++ ++/* ACC as accumulator */ + #define CLASS_BITS_PHYS (1 << 0) + #define CLASS_BITS_ACC (1 << 1) + #define CLASS_BITS_R5 (1 << 4) +@@ -771,9 +781,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + } + + struct v3d_ra_select_callback_data { ++ uint32_t phys_index; + uint32_t next_acc; + uint32_t next_phys; + struct v3d_ra_node_info *nodes; ++ const struct v3d_device_info *devinfo; + }; + + /* Choosing accumulators improves chances of merging QPU instructions +@@ -794,7 +806,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + static const int available_rf_threshold = 5; + int available_rf = 0 ; + for (int i = 0; i < PHYS_COUNT; i++) { +- if (BITSET_TEST(regs, PHYS_INDEX + i)) ++ if (BITSET_TEST(regs, v3d_ra->phys_index + i)) + available_rf++; + if (available_rf >= available_rf_threshold) + break; +@@ -854,7 +866,7 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + { + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; +- int phys = PHYS_INDEX + phys_off; ++ int phys = v3d_ra->phys_index + phys_off; + + if (BITSET_TEST(regs, phys)) { + v3d_ra->next_phys = phys_off + 1; +@@ -896,8 +908,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + * register file can be 
divided up for fragment shader threading. + */ + int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3); ++ uint8_t phys_index = get_phys_index(compiler->devinfo); + +- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, ++ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, + false); + if (!compiler->regs) + return false; +@@ -912,8 +925,8 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + compiler->reg_class_phys[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + +- for (int i = PHYS_INDEX; +- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { ++ for (int i = phys_index; ++ i < phys_index + (PHYS_COUNT >> threads); i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); +@@ -1026,7 +1039,8 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + assert(inst->dst.file == QFILE_TEMP); + uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, +- PHYS_INDEX + inst->src[0].index); ++ get_phys_index(c->devinfo) + ++ inst->src[0].index); + break; + } + } +@@ -1086,13 +1100,17 @@ v3d_register_allocate(struct v3d_compile *c) + c->num_temps + ACC_COUNT), + }; + ++ uint32_t phys_index = get_phys_index(c->devinfo); ++ + struct v3d_ra_select_callback_data callback_data = { ++ .phys_index = phys_index, + .next_acc = 0, + /* Start at RF3, to try to keep the TLB writes from using + * RF0-2. 
+ */ + .next_phys = 3, + .nodes = &c->nodes, ++ .devinfo = c->devinfo, + }; + + vir_calculate_live_intervals(c); +@@ -1139,6 +1157,7 @@ v3d_register_allocate(struct v3d_compile *c) + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); ++ + } + + /* Set the register classes for all our temporaries in the graph */ +@@ -1202,13 +1221,13 @@ v3d_register_allocate(struct v3d_compile *c) + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); +- if (ra_reg < PHYS_INDEX) { ++ if (ra_reg < phys_index) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + + ra_reg - ACC_INDEX); + } else { + temp_registers[i].magic = false; +- temp_registers[i].index = ra_reg - PHYS_INDEX; ++ temp_registers[i].index = ra_reg - phys_index; + } + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch b/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch new file mode 100644 index 0000000000..6689d6ee7f --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch @@ -0,0 +1,40 @@ +From da0a3deadf86a46c8323267d3f6a49e442835608 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 17 Sep 2021 01:07:06 +0200 +Subject: [PATCH 019/142] broadcom/compiler: don't favor/select accum registers + for hw not supporting it + +Note that what we do is to just return false on the favor/select accum +methods. We could just avoid to call them, but as the select is called +more than once, it is just easier this way. 
+--- + src/broadcom/compiler/vir_register_allocate.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index a358b616e13..1f495180784 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -797,6 +797,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + int priority) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Favor accumulators if we have less that this number of physical + * registers. Accumulators have more restrictions (like being + * invalidated through thrsw), so running out of physical registers +@@ -832,6 +835,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch new file mode 100644 index 0000000000..3085733d38 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch @@ -0,0 +1,105 @@ +From 6c04d7c917da6b38f8b2b4306ab03ed2ab7e6ce0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 9 Sep 2021 00:28:53 +0200 +Subject: [PATCH 020/142] broadcom/vir: implement is_no_op_mov for v71 + +Did some refactoring/splitting. 
+--- + src/broadcom/compiler/vir_to_qpu.c | 66 ++++++++++++++++++++++++------ + 1 file changed, 53 insertions(+), 13 deletions(-) + +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index c8b6e0a91a0..08970d52954 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -129,19 +129,8 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + } + + static bool +-is_no_op_mov(struct qinst *qinst) ++v3d33_mov_src_and_dst_equal(struct qinst *qinst) + { +- static const struct v3d_qpu_sig no_sig = {0}; +- +- /* Make sure it's just a lone MOV. */ +- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || +- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || +- qinst->qpu.alu.add.op != V3D_QPU_A_NOP || +- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { +- return false; +- } +- +- /* Check if it's a MOV from a register to itself. */ + enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; + if (qinst->qpu.alu.mul.magic_write) { + if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) +@@ -168,6 +157,57 @@ is_no_op_mov(struct qinst *qinst) + return false; + } + ++ return true; ++} ++ ++static bool ++v3d71_mov_src_and_dst_equal(struct qinst *qinst) ++{ ++ if (qinst->qpu.alu.mul.magic_write) ++ return false; ++ ++ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; ++ int raddr; ++ ++ raddr = qinst->qpu.alu.mul.a.raddr; ++ if (raddr != waddr) ++ return false; ++ ++ return true; ++} ++ ++static bool ++mov_src_and_dst_equal(struct qinst *qinst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_mov_src_and_dst_equal(qinst); ++ else ++ return v3d71_mov_src_and_dst_equal(qinst); ++} ++ ++ ++static bool ++is_no_op_mov(struct qinst *qinst, ++ const struct v3d_device_info *devinfo) ++{ ++ static const struct v3d_qpu_sig no_sig = {0}; ++ ++ /* Make sure it's just a lone MOV. We only check for M_MOV. 
Although ++ * for V3D 7.x there is also A_MOV, we don't need to check for it as ++ * we always emit using M_MOV. We could use A_MOV later on the ++ * squedule to improve performance ++ */ ++ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || ++ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || ++ qinst->qpu.alu.add.op != V3D_QPU_A_NOP || ++ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { ++ return false; ++ } ++ ++ if (!mov_src_and_dst_equal(qinst, devinfo)) ++ return false; ++ + /* No packing or flags updates, or we need to execute the + * instruction. + */ +@@ -324,7 +364,7 @@ v3d_generate_code_block(struct v3d_compile *c, + qinst->qpu.alu.mul.waddr = dst.index; + qinst->qpu.alu.mul.magic_write = dst.magic; + +- if (is_no_op_mov(qinst)) { ++ if (is_no_op_mov(qinst, c->devinfo)) { + vir_remove_instruction(c, qinst); + continue; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch new file mode 100644 index 0000000000..57bd1ad620 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch @@ -0,0 +1,104 @@ +From 7b5be2d9b178a45c34c22db2744639a6a8a216d1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 9 Sep 2021 01:18:54 +0200 +Subject: [PATCH 021/142] broadcom/compiler: update vir_to_qpu::set_src for v71 + +--- + src/broadcom/compiler/vir_to_qpu.c | 47 ++++++++++++++++++++++++++---- + 1 file changed, 42 insertions(+), 5 deletions(-) + +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index 08970d52954..afc4941fdb1 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -86,12 +86,22 @@ new_qpu_nop_before(struct qinst *inst) + return q; + } + ++static void ++v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, 
struct qpu_reg src) ++{ ++ if (src.smimm) ++ unreachable("v3d71_set_src: pending handling small immediates"); ++ ++ assert(!src.magic); ++ *raddr = src.index; ++} ++ + /** + * Allocates the src register (accumulator or register file) into the RADDR + * fields of the instruction. + */ + static void +-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) ++v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + { + if (src.smimm) { + assert(instr->sig.small_imm_b); +@@ -128,6 +138,24 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + } + } + ++/* ++ * The main purpose of the following wrapper is to make calling set_src ++ * cleaner. This is the reason it receives both mux and raddr pointers. Those ++ * will be filled or not based on the device version. ++ */ ++static void ++set_src(struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux *mux, ++ uint8_t *raddr, ++ struct qpu_reg src, ++ const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_set_src(instr, mux, src); ++ else ++ return v3d71_set_src(instr, raddr, src); ++} ++ + static bool + v3d33_mov_src_and_dst_equal(struct qinst *qinst) + { +@@ -340,13 +368,18 @@ v3d_generate_code_block(struct v3d_compile *c, + qinst->qpu.sig_magic = dst.magic; + } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); ++ + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.a.mux, src[0]); ++ &qinst->qpu.alu.add.a.mux, ++ &qinst->qpu.alu.add.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.b.mux, src[1]); ++ &qinst->qpu.alu.add.b.mux, ++ &qinst->qpu.alu.add.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.add.waddr = dst.index; +@@ -354,11 +387,15 @@ v3d_generate_code_block(struct v3d_compile *c, + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.a.mux, src[0]); ++ 
&qinst->qpu.alu.mul.a.mux, ++ &qinst->qpu.alu.mul.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.b.mux, src[1]); ++ &qinst->qpu.alu.mul.b.mux, ++ &qinst->qpu.alu.mul.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.mul.waddr = dst.index; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch b/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch new file mode 100644 index 0000000000..519e72d917 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch @@ -0,0 +1,92 @@ +From fe89703008f2a3d6bfe6e260791f712013be5e48 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 9 Sep 2021 23:59:28 +0200 +Subject: [PATCH 022/142] broadcom/qpu_schedule: add process_raddr_deps + +On v71 we don't have muxes, but more raddr. Adding a equivalent add +deps function. 
+--- + src/broadcom/compiler/qpu_schedule.c | 52 +++++++++++++++++++++++----- + 1 file changed, 44 insertions(+), 8 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 455fa3867be..89254643c90 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -155,6 +155,7 @@ static void + process_mux_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_mux mux) + { ++ assert(state->devinfo->ver < 71); + switch (mux) { + case V3D_QPU_MUX_A: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); +@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, + } + } + ++ ++static void ++process_raddr_deps(struct schedule_state *state, struct schedule_node *n, ++ uint8_t raddr, bool is_small_imm) ++{ ++ assert(state->devinfo->ver >= 71); ++ ++ if (!is_small_imm) ++ add_read_dep(state, state->last_rf[raddr], n); ++} ++ + static bool + tmu_write_is_sequence_terminator(uint32_t waddr) + { +@@ -305,15 +317,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + + /* XXX: LOAD_IMM */ + +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) +- process_mux_deps(state, n, inst->alu.add.a.mux); +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) +- process_mux_deps(state, n, inst->alu.add.b.mux); ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.a.raddr, ++ inst->sig.small_imm_a); ++ } ++ } ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.b.raddr, ++ inst->sig.small_imm_b); ++ } ++ } + +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) +- process_mux_deps(state, n, inst->alu.mul.a.mux); +- if 
(v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) +- process_mux_deps(state, n, inst->alu.mul.b.mux); ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.a.raddr, ++ inst->sig.small_imm_c); ++ } ++ } ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.b.raddr, ++ inst->sig.small_imm_d); ++ } ++ } + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch new file mode 100644 index 0000000000..e16ff0f540 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch @@ -0,0 +1,128 @@ +From 20ce426df1ab2546332141f4bc4531ada754cdea Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 10 Sep 2021 01:20:44 +0200 +Subject: [PATCH 023/142] broadcom/qpu: update disasm_raddr for v71 + +--- + src/broadcom/qpu/qpu_disasm.c | 72 ++++++++++++++++++++++++++++++++--- + 1 file changed, 66 insertions(+), 6 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index 588a665f770..b613de781dc 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -56,8 +56,9 @@ pad_to(struct disasm_state *disasm, int n) + + + static void +-v3d_qpu_disasm_raddr(struct disasm_state *disasm, +- const struct v3d_qpu_instr *instr, uint8_t mux) ++v3d33_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux mux) + { + if (mux == V3D_QPU_MUX_A) { + append(disasm, "rf%d", instr->raddr_a); +@@ -82,6 +83,65 @@ v3d_qpu_disasm_raddr(struct 
disasm_state *disasm, + } + } + ++enum v3d_qpu_input_class { ++ V3D_QPU_ADD_A, ++ V3D_QPU_ADD_B, ++ V3D_QPU_MUL_A, ++ V3D_QPU_MUL_B ++}; ++ ++static void ++v3d71_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ uint8_t raddr, ++ enum v3d_qpu_input_class input_class) ++{ ++ bool is_small_imm = false; ++ switch(input_class) { ++ case V3D_QPU_ADD_A: ++ is_small_imm = instr->sig.small_imm_a; ++ break; ++ case V3D_QPU_ADD_B: ++ is_small_imm = instr->sig.small_imm_b; ++ break; ++ case V3D_QPU_MUL_A: ++ is_small_imm = instr->sig.small_imm_c; ++ break; ++ case V3D_QPU_MUL_B: ++ is_small_imm = instr->sig.small_imm_d; ++ break; ++ } ++ ++ if (is_small_imm) { ++ unreachable("Pending handling small immediates"); ++ uint32_t val; ++ ASSERTED bool ok = ++ v3d_qpu_small_imm_unpack(disasm->devinfo, ++ raddr, ++ &val); ++ ++ if ((int)val >= -16 && (int)val <= 15) ++ append(disasm, "%d", val); ++ else ++ append(disasm, "0x%08x", val); ++ assert(ok); ++ } else { ++ append(disasm, "rf%d", raddr); ++ } ++} ++ ++static void ++v3d_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ const struct v3d_qpu_input *input, ++ enum v3d_qpu_input_class input_class) ++{ ++ if (disasm->devinfo->ver < 71) ++ v3d33_qpu_disasm_raddr(disasm, instr, input->mux); ++ else ++ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); ++} ++ + static void + v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) + { +@@ -121,14 +181,14 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); + append(disasm, "%s", + v3d_qpu_unpack_name(instr->alu.add.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux); ++ v3d_qpu_disasm_raddr(disasm, instr, 
&instr->alu.add.b, V3D_QPU_ADD_B); + append(disasm, "%s", + v3d_qpu_unpack_name(instr->alu.add.b.unpack)); + } +@@ -164,14 +224,14 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); + append(disasm, "%s", + v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); + append(disasm, "%s", + v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch b/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch new file mode 100644 index 0000000000..3b82c34ea8 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch @@ -0,0 +1,59 @@ +From 7263fa24a3c57b1dcd4d870670cda86ae89aa28c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 15 Sep 2021 10:55:49 +0200 +Subject: [PATCH 024/142] broadcom/qpu: return false on + qpu_writes_accumulatorXX helpers for v71 + +As for v71 doesn't have accumulators (devinfo->has_accumulators set to +false), those methods would always return false. 
+--- + src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 8de99c611d5..7ec3c867260 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -854,6 +854,9 @@ bool + v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if(!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) + return true; + +@@ -864,6 +867,9 @@ bool + v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -894,6 +900,9 @@ bool + v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) + return true; + +@@ -904,6 +913,9 @@ bool + v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (v3d_qpu_writes_r5(devinfo, inst)) + return true; + if (v3d_qpu_writes_r4(devinfo, inst)) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch b/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch new file mode 100644 index 0000000000..2552764a9e --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch @@ -0,0 +1,116 @@ +From 6a9611c5a22218388bba419174d3343e0cdf773b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 14 Sep 2021 10:42:55 +0200 +Subject: [PATCH 
025/142] broadcom/compiler: add support for varyings on nir to + vir generation for v71 + +Needs update as v71 doesn't have accumulators anymore, and ldvary uses +now rf0 to return the value. +--- + src/broadcom/compiler/nir_to_vir.c | 34 +++++++++++++++++------------- + 1 file changed, 19 insertions(+), 15 deletions(-) + +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index ca072971f01..79a22c3bd08 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) + + static struct qreg + emit_smooth_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg w, struct qreg r5) ++ struct qreg vary, struct qreg w, struct qreg c_reg) + { +- return vir_FADD(c, vir_FMUL(c, vary, w), r5); ++ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); + } + + static struct qreg + emit_noperspective_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { +- return vir_FADD(c, vir_MOV(c, vary), r5); ++ return vir_FADD(c, vir_MOV(c, vary), c_reg); + } + + static struct qreg + emit_flat_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { + vir_MOV_dest(c, c->undef, vary); +- return vir_MOV(c, r5); ++ return vir_MOV(c, c_reg); + } + + static struct qreg + emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + int8_t input_idx, uint8_t swizzle, int array_index) + { +- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); +- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ struct qreg c_reg; /* C coefficient */ ++ ++ if (c->devinfo->has_accumulators) ++ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ else ++ c_reg = vir_reg(QFILE_REG, 0); + + struct qinst *ldvary = NULL; + struct qreg vary; +@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + vary = vir_emit_def(c, 
ldvary); + } else { + vir_NOP(c)->qpu.sig.ldvary = true; +- vary = r3; ++ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); + } + + /* Store the input value before interpolation so we can implement +@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (input_idx >= 0) { + assert(var); + c->interp[input_idx].vp = vary; +- c->interp[input_idx].C = vir_MOV(c, r5); ++ c->interp[input_idx].C = vir_MOV(c, c_reg); + c->interp[input_idx].mode = var->data.interpolation; + } + +@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + */ + if (!var) { + assert(input_idx < 0); +- return emit_smooth_varying(c, vary, c->payload_w, r5); ++ return emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + + int i = c->num_inputs++; +@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (var->data.centroid) { + BITSET_SET(c->centroid_flags, i); + result = emit_smooth_varying(c, vary, +- c->payload_w_centroid, r5); ++ c->payload_w_centroid, c_reg); + } else { +- result = emit_smooth_varying(c, vary, c->payload_w, r5); ++ result = emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + break; + + case INTERP_MODE_NOPERSPECTIVE: + BITSET_SET(c->noperspective_flags, i); +- result = emit_noperspective_varying(c, vary, r5); ++ result = emit_noperspective_varying(c, vary, c_reg); + break; + + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); +- result = emit_flat_varying(c, vary, r5); ++ result = emit_flat_varying(c, vary, c_reg); + break; + + default: +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch new file mode 100644 index 0000000000..7302726b66 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch @@ -0,0 +1,55 @@ +From 
06af15a60f7a9c135893e5f8934b8030c1da95f9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 15 Sep 2021 01:14:15 +0200 +Subject: [PATCH 026/142] broadcom/compiler: payload_w is loaded on rf3 for v71 + +And in general rf0 is now used for other needs. +--- + src/broadcom/compiler/nir_to_vir.c | 6 +++++- + src/broadcom/compiler/vir_register_allocate.c | 6 +++++- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index 79a22c3bd08..1a05b279a2d 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -4325,7 +4325,11 @@ nir_to_vir(struct v3d_compile *c) + { + switch (c->s->info.stage) { + case MESA_SHADER_FRAGMENT: +- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ if (c->devinfo->ver < 71) ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ else ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 1f495180784..eca9a6751a6 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -1034,6 +1034,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: ++ /* V3D 7.x doesn't use rf0 for thread payload */ ++ if (c->devinfo->ver >= 71) ++ break; ++ else ++ FALLTHROUGH; + case 1: + case 2: + case 3: { +@@ -1163,7 +1168,6 @@ v3d_register_allocate(struct v3d_compile *c) + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); +- + } + + /* Set the register classes for all our temporaries in the graph */ +-- +2.39.2 + diff --git 
a/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch new file mode 100644 index 0000000000..05010aadd8 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch @@ -0,0 +1,30 @@ +From d38d8056903b9a4f96ab56261ac3b3c3be0af4fb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 15 Sep 2021 11:12:59 +0200 +Subject: [PATCH 027/142] broadcom/qpu_schedule: update write deps for v71 + +We just need to add a write dep if rf0 is written implicitly. + +Note that we don't need to check if we have accumulators when checking +for r3/r4/r5, as v3d_qpu_writes_rX would return false for hw version +that doesn't have accumulators. +--- + src/broadcom/compiler/qpu_schedule.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 89254643c90..2fa9031d7b6 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -422,6 +422,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + add_write_dep(state, &state->last_r[4], n); + if (v3d_qpu_writes_r5(devinfo, inst)) + add_write_dep(state, &state->last_r[5], n); ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) ++ add_write_dep(state, &state->last_rf[0], n); + + /* If we add any more dependencies here we should consider whether we + * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
+-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch b/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch new file mode 100644 index 0000000000..76985d943a --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch @@ -0,0 +1,140 @@ +From 7e2a2be830b1672ab846389a46b5d09bad0f7a98 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 16 Sep 2021 00:49:25 +0200 +Subject: [PATCH 028/142] broadcom/compiler: update register classes to not + include accumulators on v71 + +--- + src/broadcom/compiler/vir_register_allocate.c | 56 ++++++++++++------- + 1 file changed, 36 insertions(+), 20 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index eca9a6751a6..7b3f6c41934 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -44,10 +44,15 @@ get_phys_index(const struct v3d_device_info *devinfo) + #define CLASS_BITS_PHYS (1 << 0) + #define CLASS_BITS_ACC (1 << 1) + #define CLASS_BITS_R5 (1 << 4) +-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \ +- CLASS_BITS_ACC | \ +- CLASS_BITS_R5) + ++static uint8_t ++get_class_bit_any(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); ++ else ++ return CLASS_BITS_PHYS; ++} + static inline uint32_t + temp_to_node(struct v3d_compile *c, uint32_t temp) + { +@@ -82,11 +87,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits) + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | 
CLASS_BITS_ACC)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { +- assert(class_bits == CLASS_BITS_ANY); ++ assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } + } +@@ -447,7 +454,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); +- add_node(c, offset.index, CLASS_BITS_ANY); ++ add_node(c, offset.index, get_class_bit_any(c->devinfo)); + + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. +@@ -645,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after, so + * we can use any register class for it. + */ +- add_node(c, unif.index, CLASS_BITS_ANY); ++ add_node(c, unif.index, ++ get_class_bit_any(c->devinfo)); + } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qreg temp = + reconstruct_temp(c, reconstruct_op); +@@ -924,31 +932,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + for (int threads = 0; threads < max_thread_index; threads++) { + compiler->reg_class_any[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_r5[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_phys_or_acc[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); ++ if (compiler->devinfo->has_accumulators) { ++ compiler->reg_class_r5[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ compiler->reg_class_phys_or_acc[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ } + compiler->reg_class_phys[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ if (compiler->devinfo->has_accumulators) ++ 
ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + +- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); +- ra_class_add_reg(compiler->reg_class_any[threads], i); ++ if (compiler->devinfo->has_accumulators) { ++ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { ++ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ ra_class_add_reg(compiler->reg_class_any[threads], i); ++ } + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ +- ra_class_add_reg(compiler->reg_class_r5[threads], +- ACC_INDEX + 5); +- ra_class_add_reg(compiler->reg_class_any[threads], +- ACC_INDEX + 5); ++ if (compiler->devinfo->has_accumulators) { ++ ra_class_add_reg(compiler->reg_class_r5[threads], ++ ACC_INDEX + 5); ++ ra_class_add_reg(compiler->reg_class_any[threads], ++ ACC_INDEX + 5); ++ } + } + + ra_set_finalize(compiler->regs, NULL); +@@ -1086,7 +1101,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + } + + /* All accumulators are invalidated across a thread switch. 
*/ +- if (inst->qpu.sig.thrsw) { ++ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + set_temp_class_bits(c, i, +@@ -1157,7 +1172,8 @@ v3d_register_allocate(struct v3d_compile *c) + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; +- c->nodes.info[i].class_bits = CLASS_BITS_ANY; ++ c->nodes.info[i].class_bits = ++ get_class_bit_any(c->devinfo); + } + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch b/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch new file mode 100644 index 0000000000..4af561fa4a --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch @@ -0,0 +1,109 @@ +From 0157228c729b8812dc4900fa24db63b7d27aa342 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 23 Sep 2021 11:19:58 +0200 +Subject: [PATCH 029/142] broadcom/compiler: implement "reads/writes too soon" + checks for v71 + +--- + src/broadcom/compiler/qpu_schedule.c | 65 ++++++++++++++++++++++------ + 1 file changed, 51 insertions(+), 14 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 2fa9031d7b6..4db0c2e72da 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -562,7 +562,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, + } + + static bool +-reads_too_soon_after_write(struct choose_scoreboard *scoreboard, ++reads_too_soon(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ switch (raddr) { ++ case 0: /* ldvary delayed write of C coefficient to rf0 */ ++ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) ++ return true; ++ break; ++ default: ++ break; ++ } 
++ ++ return false; ++} ++ ++static bool ++reads_too_soon_after_write(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, + struct qinst *qinst) + { + const struct v3d_qpu_instr *inst = &qinst->qpu; +@@ -574,24 +591,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) ++ return true; ++ } + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) ++ return true; ++ } else { ++ if 
(reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) ++ return true; ++ } + } + } + +@@ -1147,7 +1184,7 @@ retry: + * regfile A or B that was written to by the previous + * instruction." + */ +- if (reads_too_soon_after_write(scoreboard, n->inst)) ++ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst)) + continue; + + if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch new file mode 100644 index 0000000000..9704a18a6b --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch @@ -0,0 +1,118 @@ +From 3fb3333bdf9699157cf0a2bd46ba4c25058bc5c1 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 23 Sep 2021 11:44:59 +0200 +Subject: [PATCH 030/142] broadcom/compiler: implement read stall check for v71 + +--- + src/broadcom/compiler/qpu_schedule.c | 32 +++++++++++++++++----------- + src/broadcom/qpu/qpu_instr.c | 12 +++++++++++ + src/broadcom/qpu/qpu_instr.h | 2 ++ + 3 files changed, 34 insertions(+), 12 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 4db0c2e72da..b78abe003e9 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -679,29 +679,37 @@ pixel_scoreboard_too_soon(struct v3d_compile *c, + } + + static bool +-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, ++qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && +- inst->raddr_a == waddr) +- return true; ++ if (devinfo->ver < 71) { ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && ++ 
inst->raddr_a == waddr) ++ return true; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && +- !inst->sig.small_imm_b && (inst->raddr_b == waddr)) +- return true; ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && ++ !inst->sig.small_imm_b && (inst->raddr_b == waddr)) ++ return true; ++ } else { ++ /* FIXME: skip if small immediate */ ++ if (v3d71_qpu_reads_raddr(inst, waddr)) ++ return true; ++ } + + return false; + } + + static bool +-mux_read_stalls(struct choose_scoreboard *scoreboard, +- const struct v3d_qpu_instr *inst) ++read_stalls(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst) + { + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && +- qpu_instruction_uses_rf(inst, ++ qpu_instruction_uses_rf(devinfo, inst, + scoreboard->last_stallable_sfu_reg); + } + +@@ -1319,7 +1327,7 @@ retry: + + int prio = get_instruction_priority(c->devinfo, inst); + +- if (mux_read_stalls(scoreboard, inst)) { ++ if (read_stalls(c->devinfo, scoreboard, inst)) { + /* Don't merge an instruction that stalls */ + if (prev_inst) + continue; +@@ -2389,7 +2397,7 @@ schedule_instructions(struct v3d_compile *c, + } + } + } +- if (mux_read_stalls(scoreboard, inst)) ++ if (read_stalls(c->devinfo, scoreboard, inst)) + c->qpu_inst_stalled_count++; + } + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 7ec3c867260..e8bbb2141b0 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -956,6 +956,18 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) + (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); + } + ++bool ++v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); ++ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); ++ ++ return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) || ++ (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) || ++ (mul_nsrc > 
0 && inst->alu.mul.a.raddr == raddr) || ++ (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr); ++} ++ + bool + v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo, + const struct v3d_qpu_sig *sig) +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index a25be8e0ee6..9f7582ab06d 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -494,4 +494,6 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + + bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++ ++bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); + #endif +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch b/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch new file mode 100644 index 0000000000..3aec307f63 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch @@ -0,0 +1,65 @@ +From cbe0a7a06a5fb9b3f28acba8c9cac362a6bc5324 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 6 Oct 2021 13:58:00 +0200 +Subject: [PATCH 031/142] broadcom/compiler: add a + v3d71_qpu_writes_waddr_explicitly helper + +--- + src/broadcom/qpu/qpu_instr.c | 28 ++++++++++++++++++++++++++++ + src/broadcom/qpu/qpu_instr.h | 3 +++ + 2 files changed, 31 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index e8bbb2141b0..feb6b343c1c 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -968,6 +968,34 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) + (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr); + } + ++bool ++v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t 
waddr) ++{ ++ if (inst->type != V3D_QPU_INSTR_TYPE_ALU) ++ return false; ++ ++ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && ++ !inst->alu.add.magic_write && ++ inst->alu.add.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && ++ !inst->alu.mul.magic_write && ++ inst->alu.mul.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic && inst->sig_addr == waddr) { ++ return true; ++ } ++ ++ return false; ++} ++ + bool + v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo, + const struct v3d_qpu_sig *sig) +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 9f7582ab06d..50a69ce8c3a 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -496,4 +496,7 @@ bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + + bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); ++bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t waddr); + #endif +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch b/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch new file mode 100644 index 0000000000..f5e3fb5f22 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch @@ -0,0 +1,67 @@ +From 92e91a9b22ae61dc9f39880e8fdaa7714789efdb Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 27 Sep 2021 11:49:24 +0200 +Subject: [PATCH 032/142] broadcom/compiler: prevent rf2-3 usage in thread end + delay slots for v71 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Iago Toral Quiroga 
+Signed-off-by: Alejandro Piñeiro +--- + src/broadcom/compiler/qpu_schedule.c | 37 +++++++++++++++++++++------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index b78abe003e9..839c0c62315 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1691,16 +1691,35 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) + return false; + +- /* RF0-2 might be overwritten during the delay slots by +- * fragment shader setup. +- */ +- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* RF0-2 might be overwritten during the delay slots by ++ * fragment shader setup. ++ */ ++ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) ++ return false; + +- if (inst->raddr_b < 3 && +- !inst->sig.small_imm_b && +- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { +- return false; ++ if (inst->raddr_b < 3 && ++ !inst->sig.small_imm_b && ++ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { ++ return false; ++ } ++ } ++ ++ if (c->devinfo->ver >= 71) { ++ /* RF2-3 might be overwritten during the delay slots by ++ * fragment shader setup. 
++ * ++ * FIXME: handle small immediate cases ++ */ ++ if (v3d71_qpu_reads_raddr(inst, 2) || ++ v3d71_qpu_reads_raddr(inst, 3)) { ++ return false; ++ } ++ ++ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || ++ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { ++ return false; ++ } + } + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch new file mode 100644 index 0000000000..4a2b89038b --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch @@ -0,0 +1,78 @@ +From 68a1545eb973e41608534ff05a9e84a86c046453 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 27 Sep 2021 13:26:04 +0200 +Subject: [PATCH 033/142] broadcom/qpu: add new ADD opcodes for FMOV/MOV in v71 + +--- + src/broadcom/qpu/qpu_instr.c | 5 +++++ + src/broadcom/qpu/qpu_instr.h | 4 ++++ + src/broadcom/qpu/qpu_pack.c | 15 +++++++++++++++ + 3 files changed, 24 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index feb6b343c1c..195a0dcd232 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -177,6 +177,8 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) + [V3D_QPU_A_ITOF] = "itof", + [V3D_QPU_A_CLZ] = "clz", + [V3D_QPU_A_UTOF] = "utof", ++ [V3D_QPU_A_MOV] = "mov", ++ [V3D_QPU_A_FMOV] = "fmov", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -458,6 +460,9 @@ static const uint8_t add_op_args[] = { + [V3D_QPU_A_ITOF] = D | A, + [V3D_QPU_A_CLZ] = D | A, + [V3D_QPU_A_UTOF] = D | A, ++ ++ [V3D_QPU_A_MOV] = D | A, ++ [V3D_QPU_A_FMOV] = D | A, + }; + + static const uint8_t mul_op_args[] = { +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 50a69ce8c3a..c86a4119c54 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ 
b/src/broadcom/qpu/qpu_instr.h +@@ -227,6 +227,10 @@ enum v3d_qpu_add_op { + V3D_QPU_A_ITOF, + V3D_QPU_A_CLZ, + V3D_QPU_A_UTOF, ++ ++ /* V3D 7.x */ ++ V3D_QPU_A_FMOV, ++ V3D_QPU_A_MOV, + }; + + enum v3d_qpu_mul_op { +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 4045275cb9a..0e504e65fbf 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -776,6 +776,21 @@ static const struct opcode_desc add_ops_v71[] = { + + { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, ++ + }; + + static const struct opcode_desc mul_ops_v71[] = { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch b/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch new file mode 100644 index 0000000000..df5222700d --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch @@ -0,0 +1,46 @@ +From 8dbbb7e22b694fdc62376d112b3dc6105d556c63 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: 
Mon, 4 Oct 2021 13:07:35 +0200 +Subject: [PATCH 034/142] broadcom/qpu: fix packing/unpacking of fmov variants + for v71 + +--- + src/broadcom/qpu/qpu_pack.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 0e504e65fbf..0eb820b3f10 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -1405,9 +1405,9 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + break; + + case V3D_QPU_M_FMOV: +- instr->alu.mul.output_pack = (raddr_d >> 2) & 1; ++ instr->alu.mul.output_pack = raddr_d & 0x3; + +- if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3, ++ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, + &instr->alu.mul.a.unpack)) { + return false; + } +@@ -2046,14 +2046,13 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + &packed)) { + return false; + } +- opcode |= (packed >> 1) & 1; +- raddr_d = (packed & 1) << 2; ++ raddr_d |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +- raddr_d |= packed; ++ raddr_d |= packed << 2; + break; + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch b/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch new file mode 100644 index 0000000000..2e244c13dc --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch @@ -0,0 +1,107 @@ +From 63d0059ebef288afb0e2e746dadda8c2238bdfcb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 28 Sep 2021 01:17:08 +0200 +Subject: [PATCH 035/142] broadcom/qpu: implement switch rules for fmin/fmax + fadd/faddnf for v71 + +They use the same opcodes, and switch between one and the other based +on raddr. 
+ +Note that the rule rule includes also if small_imm_a/b are used. That +is still not in place so that part is hardcode. Would be updated later +when small immediates support for v71 gets implemented. +--- + src/broadcom/qpu/qpu_pack.c | 48 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 48 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 0eb820b3f10..7a262f18ac3 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -651,7 +651,9 @@ static const struct opcode_desc mul_ops_v33[] = { + * opcodes that changed on v71 + */ + static const struct opcode_desc add_ops_v71[] = { ++ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, + { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, + { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, +@@ -666,6 +668,10 @@ static const struct opcode_desc add_ops_v71[] = { + { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, + { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, + { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, ++ /* FMIN is instead FMAX depending on the raddr_a/b order. */ ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, ++ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, + + { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, + { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, +@@ -1162,6 +1168,22 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + + instr->alu.add.op = desc->op; + ++ /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the ++ * operands. 
++ */ ++ /* FIXME: for now hardcoded values, until we got the small_imm support ++ * in place ++ */ ++ uint32_t small_imm_a = 0; ++ uint32_t small_imm_b = 0; ++ if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > ++ small_imm_b *256 + (op & 3) * 64 + raddr_b) { ++ if (instr->alu.add.op == V3D_QPU_A_FMIN) ++ instr->alu.add.op = V3D_QPU_A_FMAX; ++ if (instr->alu.add.op == V3D_QPU_A_FADD) ++ instr->alu.add.op = V3D_QPU_A_FADDNF; ++ } ++ + /* Some QPU ops require a bit more than just basic opcode and mux a/b + * comparisons to distinguish them. + */ +@@ -1754,6 +1776,11 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + uint32_t output_pack; + uint32_t a_unpack; + uint32_t b_unpack; ++ /* FIXME: for now hardcoded values, until we got the small_imm ++ * support in place ++ */ ++ uint32_t small_imm_a = 0; ++ uint32_t small_imm_b = 0; + + if (instr->alu.add.op != V3D_QPU_A_FCMP) { + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, +@@ -1773,6 +1800,27 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + return false; + } + ++ /* These operations with commutative operands are ++ * distinguished by which order their operands come in. 
++ */ ++ bool ordering = ++ small_imm_a * 256 + a_unpack * 64 + raddr_a > ++ small_imm_b * 256 + b_unpack * 64 + raddr_b; ++ if (((instr->alu.add.op == V3D_QPU_A_FMIN || ++ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || ++ ((instr->alu.add.op == V3D_QPU_A_FMAX || ++ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { ++ uint32_t temp; ++ ++ temp = a_unpack; ++ a_unpack = b_unpack; ++ b_unpack = temp; ++ ++ temp = raddr_a; ++ raddr_a = raddr_b; ++ raddr_b = temp; ++ } ++ + opcode |= a_unpack << 2; + opcode |= b_unpack << 0; + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch b/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch new file mode 100644 index 0000000000..6c80d4a9ab --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch @@ -0,0 +1,37 @@ +From c9f6faa3ddc91024b3d9dc67ce2221187daac128 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 11:54:18 +0200 +Subject: [PATCH 036/142] broadcom/compiler: make vir_write_rX return false on + platforms without accums + +--- + src/broadcom/compiler/vir.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index 007cb0a941b..d75cd777b6d 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -158,6 +158,9 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) + bool + vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + switch (inst->src[i].file) { + case QFILE_VPM: +@@ -180,6 +183,9 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) + bool + vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) + { ++ if 
(!devinfo->has_accumulators) ++ return false; ++ + switch (inst->dst.file) { + case QFILE_MAGIC: + switch (inst->dst.index) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch b/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch new file mode 100644 index 0000000000..1dea74a300 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch @@ -0,0 +1,77 @@ +From 3d16229743e26b58735ed049ee982073f6034342 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 12:03:50 +0200 +Subject: [PATCH 037/142] broadcom/compiler: rename vir_writes_rX to + vir_writes_rX_implicitly + +Since that represents more accurately what they check.. +--- + src/broadcom/compiler/v3d_compiler.h | 4 ++-- + src/broadcom/compiler/vir.c | 6 ++++-- + src/broadcom/compiler/vir_register_allocate.c | 4 ++-- + 3 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index eb4e692464b..7e8f3bfc1a7 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -1149,8 +1149,8 @@ bool vir_is_raw_mov(struct qinst *inst); + bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); + bool vir_is_add(struct qinst *inst); + bool vir_is_mul(struct qinst *inst); +-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); +-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); + struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); + uint8_t vir_channels_written(struct qinst *inst); + struct qreg ntq_get_src(struct v3d_compile 
*c, nir_src src, int i); +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index d75cd777b6d..aea113f050e 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -156,7 +156,8 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { + if (!devinfo->has_accumulators) + return false; +@@ -181,7 +182,8 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { + if (!devinfo->has_accumulators) + return false; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 7b3f6c41934..f2df35cd458 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -988,7 +988,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * result to a temp), nothing else can be stored in r3/r4 across + * it. 
+ */ +- if (vir_writes_r3(c->devinfo, inst)) { ++ if (vir_writes_r3_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +@@ -998,7 +998,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + } + } + +- if (vir_writes_r4(c->devinfo, inst)) { ++ if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch b/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch new file mode 100644 index 0000000000..b39e7bda94 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch @@ -0,0 +1,170 @@ +From 83fae160491737e8568b8fb5eaa5be4d2c8bf3c8 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 12:10:31 +0200 +Subject: [PATCH 038/142] broadcom/compiler: only handle accumulator classes if + present + +--- + src/broadcom/compiler/vir_register_allocate.c | 77 ++++++++++++------- + 1 file changed, 49 insertions(+), 28 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index f2df35cd458..e78ccb7c6aa 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -53,6 +53,17 @@ get_class_bit_any(const struct v3d_device_info *devinfo) + else + return CLASS_BITS_PHYS; + } ++ ++static uint8_t ++filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) ++{ ++ if (!devinfo->has_accumulators) { ++ assert(class_bits & CLASS_BITS_PHYS); ++ class_bits = CLASS_BITS_PHYS; ++ } ++ return class_bits; ++} ++ + static inline uint32_t + 
temp_to_node(struct v3d_compile *c, uint32_t temp) + { +@@ -413,8 +424,10 @@ v3d_setup_spill_base(struct v3d_compile *c) + */ + if (c->spilling) { + int temp_class = CLASS_BITS_PHYS; +- if (i != c->spill_base.index) ++ if (c->devinfo->has_accumulators && ++ i != c->spill_base.index) { + temp_class |= CLASS_BITS_ACC; ++ } + add_node(c, i, temp_class); + } + } +@@ -473,14 +486,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); +- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); +- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to +@@ -662,8 +677,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after so we + * can use ACC. 
+ */ +- add_node(c, temp.index, CLASS_BITS_PHYS | +- CLASS_BITS_ACC); ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | ++ CLASS_BITS_ACC); ++ add_node(c, temp.index, temp_class); + } else { + /* If we have a postponed spill, we + * don't need a fill as the temp would +@@ -941,6 +958,7 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + compiler->reg_class_phys[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + ++ /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) +@@ -949,16 +967,15 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + ++ /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } +- } +- /* r5 can only store a single 32-bit value, so not much can +- * use it. +- */ +- if (compiler->devinfo->has_accumulators) { ++ /* r5 can only store a single 32-bit value, so not much can ++ * use it. ++ */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], +@@ -1081,21 +1098,23 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * because ldunif has usually a shorter lifespan, allowing for + * more accumulator reuse and QPU merges. + */ +- if (!inst->qpu.sig.ldunif) { +- uint8_t class_bits = +- get_temp_class_bits(c, inst->dst.index) & +- ~CLASS_BITS_R5; +- set_temp_class_bits(c, inst->dst.index, +- class_bits); +- +- } else { +- /* Until V3D 4.x, we could only load a uniform +- * to r5, so we'll need to spill if uniform +- * loads interfere with each other. 
+- */ +- if (c->devinfo->ver < 40) { ++ if (c->devinfo->has_accumulators) { ++ if (!inst->qpu.sig.ldunif) { ++ uint8_t class_bits = ++ get_temp_class_bits(c, inst->dst.index) & ++ ~CLASS_BITS_R5; + set_temp_class_bits(c, inst->dst.index, +- CLASS_BITS_R5); ++ class_bits); ++ ++ } else { ++ /* Until V3D 4.x, we could only load a uniform ++ * to r5, so we'll need to spill if uniform ++ * loads interfere with each other. ++ */ ++ if (c->devinfo->ver < 40) { ++ set_temp_class_bits(c, inst->dst.index, ++ CLASS_BITS_R5); ++ } + } + } + } +@@ -1152,8 +1171,10 @@ v3d_register_allocate(struct v3d_compile *c) + c->thread_index--; + } + +- c->g = ra_alloc_interference_graph(c->compiler->regs, +- c->num_temps + ARRAY_SIZE(acc_nodes)); ++ unsigned num_ra_nodes = c->num_temps; ++ if (c->devinfo->has_accumulators) ++ num_ra_nodes += ARRAY_SIZE(acc_nodes); ++ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); + + /* Make some fixed nodes for the accumulators, which we will need to +@@ -1162,8 +1183,8 @@ v3d_register_allocate(struct v3d_compile *c) + * live in, but the classes take up a lot of memory to set up, so we + * don't want to make too many. 
+ */ +- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) { +- if (i < ACC_COUNT) { ++ for (uint32_t i = 0; i < num_ra_nodes; i++) { ++ if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + c->nodes.info[i].priority = 0; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch new file mode 100644 index 0000000000..e7553a8295 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch @@ -0,0 +1,187 @@ +From fd77cc3204e7c69927f97ce2a1d55d2a47d77a27 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 12:14:04 +0200 +Subject: [PATCH 039/142] broadcom/compiler: don't assign rf0 to temps across + implicit rf0 writes + +In platforms that don't have accumulators and have implicit writes to +the register file we need to be careful and avoid assigning a physical +register to a temp that lives across an implicit write to that same +physical register. + +For now, we have the case of implicit writes to rf0 from various +signals, but it should be easy to extend this to include additional +registers if needed. 
+--- + src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++---- + 1 file changed, 57 insertions(+), 12 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index e78ccb7c6aa..e0adc1de7a4 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -29,6 +29,9 @@ + #define ACC_INDEX 0 + #define ACC_COUNT 6 + ++/* RA nodes used to track RF registers with implicit writes */ ++#define IMPLICIT_RF_COUNT 1 ++ + #define PHYS_COUNT 64 + + static uint8_t +@@ -67,15 +70,17 @@ filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) + static inline uint32_t + temp_to_node(struct v3d_compile *c, uint32_t temp) + { +- return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0); ++ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint32_t + node_to_temp(struct v3d_compile *c, uint32_t node) + { + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || +- (!c->devinfo->has_accumulators && node >= 0)); +- return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0); ++ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); ++ return node - (c->devinfo->has_accumulators ? ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint8_t +@@ -360,7 +365,8 @@ ensure_nodes(struct v3d_compile *c) + c->nodes.info = reralloc_array_size(c, + c->nodes.info, + sizeof(c->nodes.info[0]), +- c->nodes.alloc_count + ACC_COUNT); ++ c->nodes.alloc_count + ++ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); + } + + /* Creates the interference node for a new temp. We use this to keep the node +@@ -372,7 +378,8 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + ensure_nodes(c); + + int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); +- assert(node == temp + ACC_COUNT); ++ assert(c->devinfo->has_accumulators ? 
node == temp + ACC_COUNT : ++ node == temp + IMPLICIT_RF_COUNT); + + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; +@@ -995,7 +1002,9 @@ tmu_spilling_allowed(struct v3d_compile *c) + } + + static void +-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, ++update_graph_and_reg_classes_for_inst(struct v3d_compile *c, ++ int *acc_nodes, ++ int *implicit_rf_nodes, + struct qinst *inst) + { + int32_t ip = inst->ip; +@@ -1025,6 +1034,19 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + } + } + ++ /* If any instruction writes to a physical register implicitly ++ * nothing else can write the same register across it. ++ */ ++ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && c->temp_end[i] > ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: +@@ -1116,6 +1138,16 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + CLASS_BITS_R5); + } + } ++ } else { ++ /* If the instruction has an implicit write ++ * we can't allocate its dest to the same ++ * register. 
++ */ ++ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, inst->dst.index), ++ implicit_rf_nodes[0]); ++ } + } + } + +@@ -1139,10 +1171,18 @@ struct qpu_reg * + v3d_register_allocate(struct v3d_compile *c) + { + int acc_nodes[ACC_COUNT]; ++ int implicit_rf_nodes[IMPLICIT_RF_COUNT]; ++ ++ unsigned num_ra_nodes = c->num_temps; ++ if (c->devinfo->has_accumulators) ++ num_ra_nodes += ARRAY_SIZE(acc_nodes); ++ else ++ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); ++ + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), +- c->num_temps + ACC_COUNT), ++ num_ra_nodes), + }; + + uint32_t phys_index = get_phys_index(c->devinfo); +@@ -1171,9 +1211,6 @@ v3d_register_allocate(struct v3d_compile *c) + c->thread_index--; + } + +- unsigned num_ra_nodes = c->num_temps; +- if (c->devinfo->has_accumulators) +- num_ra_nodes += ARRAY_SIZE(acc_nodes); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); + +@@ -1181,7 +1218,8 @@ v3d_register_allocate(struct v3d_compile *c) + * interfere with when ops have implied r3/r4 writes or for the thread + * switches. We could represent these as classes for the nodes to + * live in, but the classes take up a lot of memory to set up, so we +- * don't want to make too many. ++ * don't want to make too many. We use the same mechanism on platforms ++ * without accumulators that can have implicit writes to phys regs. 
+ */ + for (uint32_t i = 0; i < num_ra_nodes; i++) { + if (c->devinfo->has_accumulators && i < ACC_COUNT) { +@@ -1189,6 +1227,12 @@ v3d_register_allocate(struct v3d_compile *c) + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; ++ } else if (!c->devinfo->has_accumulators && ++ i < ARRAY_SIZE(implicit_rf_nodes)) { ++ implicit_rf_nodes[i] = i; ++ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); ++ c->nodes.info[i].priority = 0; ++ c->nodes.info[i].class_bits = 0; + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = +@@ -1204,7 +1248,8 @@ v3d_register_allocate(struct v3d_compile *c) + int ip = 0; + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; +- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); ++ update_graph_and_reg_classes_for_inst(c, acc_nodes, ++ implicit_rf_nodes, inst); + } + + /* Set the register classes for all our temporaries in the graph */ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch new file mode 100644 index 0000000000..8eee3ac26c --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch @@ -0,0 +1,33 @@ +From 9a08ae9f354a6da6d9d71b87800aca8b3df49e29 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 13:37:28 +0200 +Subject: [PATCH 040/142] broadcom/compiler: CS payload registers have changed + in v71 + +--- + src/broadcom/compiler/nir_to_vir.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index 1a05b279a2d..220ff6bcd49 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -4362,8 +4362,13 @@ nir_to_vir(struct 
v3d_compile *c) + V3D_QPU_WADDR_SYNC)); + } + +- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); +- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ if (c->devinfo->ver <= 42) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } else if (c->devinfo->ver >= 71) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } + + /* Set up the division between gl_LocalInvocationIndex and + * wg_in_mem in the payload reg. +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch b/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch new file mode 100644 index 0000000000..193468668e --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch @@ -0,0 +1,46 @@ +From 5477884196cb54a71f54fa6cad42c6d3326bde88 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 22 Oct 2021 13:39:48 +0200 +Subject: [PATCH 041/142] broadcom/compiler: don't schedule rf0 writes right + after ldvary + +ldvary writes rf0 implicitly on the next cycle so they would clash. +This case is not handled correctly by our normal dependency tracking, +which doesn't know anything about delayed writes from instructions +and thinks the rf0 write happens on the same cycle ldvary is emitted. 
+ +Fixes (v71): +dEQP-VK.glsl.conversions.matrix_to_matrix.mat2x3_to_mat4x2_fragment +--- + src/broadcom/compiler/qpu_schedule.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 839c0c62315..870823fd2b1 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -652,6 +652,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, + v3d_qpu_writes_r4(devinfo, inst)) + return true; + ++ if (devinfo->ver <= 42) ++ return false; ++ ++ /* Don't schedule anything that writes rf0 right after ldvary, since ++ * that would clash with the ldvary's delayed rf0 write (the exception ++ * is another ldvary, since its implicit rf0 write would also have ++ * one cycle of delay and would not clash). ++ */ ++ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && ++ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !inst->sig.ldvary))) { ++ return true; ++ } ++ + return false; + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch new file mode 100644 index 0000000000..dcb28320d5 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch @@ -0,0 +1,60 @@ +From 31623712c2f741d393767641f32d56c35150eda5 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 30 Sep 2021 13:22:48 +0200 +Subject: [PATCH 042/142] broadcom/compiler: allow instruction merges in v71 + +In v3d 4.x there were restrictions based on the number of raddrs used +by the combined instructions, but we don't have these restrictions in +v3d 7.x. 
+ +It should be noted that while there are no restrictions on the number +of raddrs addressed, a QPU instruction can only address a single small +immediate, so we should be careful about that when we add support for +small immediates. +--- + src/broadcom/compiler/qpu_schedule.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 870823fd2b1..ff544fb3c1c 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -906,8 +906,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, + static bool + qpu_merge_raddrs(struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *add_instr, +- const struct v3d_qpu_instr *mul_instr) ++ const struct v3d_qpu_instr *mul_instr, ++ const struct v3d_device_info *devinfo) + { ++ assert(devinfo->ver <= 42); ++ + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); + int naddrs = util_bitcount64(raddrs_used); + +@@ -1111,9 +1114,19 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + add_instr = a; + } + +- if (add_instr && mul_instr && +- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { +- return false; ++ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and ++ * they have restrictions on the number of raddrs that can be adressed ++ * in a single instruction. ++ * ++ * FIXME: for V3D 7.x we can't merge instructions if they address more ++ * than one small immediate. For now, we don't support small immediates, ++ * so it is not a problem. 
++ */ ++ if (devinfo->ver <= 42) { ++ if (add_instr && mul_instr && ++ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { ++ return false; ++ } + } + + merge.sig.thrsw |= b->sig.thrsw; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch b/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch new file mode 100644 index 0000000000..1df473d3de --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch @@ -0,0 +1,172 @@ +From 959a0128654c94d84fda53ffc108971d3b3a817a Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 6 Oct 2021 09:27:43 +0200 +Subject: [PATCH 043/142] broadcom/qpu: add MOV integer packing/unpacking + variants + +These are new in v71 and cover MOV on both the ADD and the MUL alus. +--- + src/broadcom/qpu/qpu_instr.h | 9 ++++ + src/broadcom/qpu/qpu_pack.c | 98 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 107 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index c86a4119c54..4b34d17bd4c 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -285,6 +285,15 @@ enum v3d_qpu_input_unpack { + + /** Swap high and low 16 bits */ + V3D_QPU_UNPACK_SWAP_16, ++ ++ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UL, ++ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UH, ++ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IL, ++ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IH, + }; + + enum v3d_qpu_mux { +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 7a262f18ac3..4d677894755 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -922,6 +922,56 @@ 
v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + } + } + ++static bool ++v3d_qpu_int32_unpack_unpack(uint32_t packed, ++ enum v3d_qpu_input_unpack *unpacked) ++{ ++ switch (packed) { ++ case 0: ++ *unpacked = V3D_QPU_UNPACK_NONE; ++ return true; ++ case 1: ++ *unpacked = V3D_QPU_UNPACK_UL; ++ return true; ++ case 2: ++ *unpacked = V3D_QPU_UNPACK_UH; ++ return true; ++ case 3: ++ *unpacked = V3D_QPU_UNPACK_IL; ++ return true; ++ case 4: ++ *unpacked = V3D_QPU_UNPACK_IH; ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, ++ uint32_t *packed) ++{ ++ switch (unpacked) { ++ case V3D_QPU_UNPACK_NONE: ++ *packed = 0; ++ return true; ++ case V3D_QPU_UNPACK_UL: ++ *packed = 1; ++ return true; ++ case V3D_QPU_UNPACK_UH: ++ *packed = 2; ++ return true; ++ case V3D_QPU_UNPACK_IL: ++ *packed = 3; ++ return true; ++ case V3D_QPU_UNPACK_IH: ++ *packed = 4; ++ return true; ++ default: ++ return false; ++ } ++} ++ + static bool + v3d_qpu_float16_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +@@ -1273,6 +1323,15 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + ++ case V3D_QPU_A_MOV: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; +@@ -1449,6 +1508,15 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + + break; + ++ case V3D_QPU_M_MOV: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ break; ++ + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + instr->alu.mul.a.unpack = 
V3D_QPU_UNPACK_NONE; +@@ -1909,6 +1977,21 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + opcode |= packed; + break; + ++ case V3D_QPU_A_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_b |= packed << 2; ++ break; ++ } ++ + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +@@ -2126,6 +2209,21 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + break; + } + ++ case V3D_QPU_M_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_d |= packed << 2; ++ break; ++ } ++ + default: + break; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch b/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch new file mode 100644 index 0000000000..864966dbea --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch @@ -0,0 +1,47 @@ +From 2e86dd0c357d7b432ce6794ae22fbfae89ad186b Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 6 Oct 2021 12:01:10 +0200 +Subject: [PATCH 044/142] broadcom/qpu: fail packing on unhandled mul + pack/unpack + +We are doing this for the ADD alu already and it may be helpful to +identify cases where we have QPU code with pack/unpack modifiers on +MUL opcodes that we then are not packing into the actual QPU +instructions. 
+--- + src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 4d677894755..180d7ab08a3 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -2106,6 +2106,12 @@ v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo, + } + + default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } + break; + } + +@@ -2225,6 +2231,12 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + } + + default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } + break; + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch b/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch new file mode 100644 index 0000000000..cc4befe719 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch @@ -0,0 +1,30 @@ +From ed6bfa29d43b5a89ff070961454f1e82e23b4f45 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 8 Oct 2021 15:10:24 +0200 +Subject: [PATCH 045/142] broadcom/compiler: generalize check for shaders using + pixel center W + +V3D 4.x has pixel center W in rf0 and V3D 7.x has it in rf3. We already +account for this when we setup the c->payload_w, so use that. 
+--- + src/broadcom/compiler/nir_to_vir.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index 220ff6bcd49..90fe1d1e7f0 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -4547,8 +4547,8 @@ vir_check_payload_w(struct v3d_compile *c) + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { +- if (inst->src[i].file == QFILE_REG && +- inst->src[i].index == 0) { ++ if (inst->src[i].file == c->payload_w.file && ++ inst->src[i].index == c->payload_w.index) { + c->uses_center_w = true; + return; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch b/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch new file mode 100644 index 0000000000..23f70c60d3 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch @@ -0,0 +1,34 @@ +From e1a0fa2c2010ef29b8cec798cd0fc99cf44f3a2d Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 14 Oct 2021 14:16:40 +0200 +Subject: [PATCH 046/142] broadcom/compiler: v71 isn't affected by + double-rounding of viewport X,Y coords + +--- + src/broadcom/compiler/v3d_nir_lower_io.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c +index 3ef0e398228..4cdba3748a1 100644 +--- a/src/broadcom/compiler/v3d_nir_lower_io.c ++++ b/src/broadcom/compiler/v3d_nir_lower_io.c +@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + * The correct fix for this as recommended by Broadcom + * is to convert to .8 fixed-point with ffloor(). 
+ */ +- pos = nir_f2i32(b, nir_ffloor(b, pos)); +- v3d_nir_store_output(b, state->vp_vpm_offset + i, +- offset_reg, pos); ++ if (c->devinfo->ver <= 42) ++ pos = nir_f2i32(b, nir_ffloor(b, pos)); ++ else ++ pos = nir_f2i32(b, nir_fround_even(b, pos)); ++ ++ v3d_nir_store_output(b, state->vp_vpm_offset + i, ++ offset_reg, pos); + } + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch new file mode 100644 index 0000000000..45dd5fba46 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch @@ -0,0 +1,31 @@ +From 697e6cf01b781b244404872f331a778b6d4e67da Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 19 Oct 2021 11:16:43 +0200 +Subject: [PATCH 047/142] broadcom/compiler: update one TMUWT restriction for + v71 + +TMUWT not allowed in the final instruction restriction doesn't apply +for v71. +--- + src/broadcom/compiler/qpu_schedule.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index ff544fb3c1c..25f79aa6f46 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1700,8 +1700,10 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + /* GFXH-1625: TMUWT not allowed in the final instruction. */ +- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) ++ if (c->devinfo->ver <= 42 && slot == 2 && ++ inst->alu.add.op == V3D_QPU_A_TMUWT) { + return false; ++ } + + /* No writing physical registers at the end. 
*/ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch new file mode 100644 index 0000000000..75d16def81 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch @@ -0,0 +1,37 @@ +From 26fea727a9f34b75a3fe3f6a806accaddcc317f6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 19 Oct 2021 11:51:32 +0200 +Subject: [PATCH 048/142] broadcom/compiler: update ldunif/ldvary comment for + v71 + +For v42 and below ldunif/ldvary write both on r5, but with a different +delay, so we need to take that into account when scheduling both. + +For v71 the register used is rf0, but the behaviour is the same. So +the scheduling code can be the same, but the comment needs update. +--- + src/broadcom/compiler/qpu_schedule.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 25f79aa6f46..e8197661f89 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1234,10 +1234,11 @@ retry: + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) + continue; + +- /* ldunif and ldvary both write r5, but ldunif does so a tick +- * sooner. If the ldvary's r5 wasn't used, then ldunif might ++ /* ldunif and ldvary both write the same register (r5 for v42 ++ * and below, rf0 for v71), but ldunif does so a tick sooner. ++ * If the ldvary's register wasn't used, then ldunif might + * otherwise get scheduled so ldunif and ldvary try to update +- * r5 in the same tick. ++ * the register in the same tick. 
+ */ + if ((inst->sig.ldunif || inst->sig.ldunifa) && + scoreboard->tick == scoreboard->last_ldvary_tick + 1) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch b/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch new file mode 100644 index 0000000000..b66dc181f4 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch @@ -0,0 +1,52 @@ +From 70456e27b039174f767010f96d9b649e5e42d84f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 19 Oct 2021 23:52:30 +0200 +Subject: [PATCH 049/142] broadcom/compiler: update payload registers handling + when computing live intervals + +As for v71 the payload registers are not the same. Specifically now +rf3 is used as payload register, so this is needed to avoid rf3 being +selected as an instruction dst by the register allocator, overwriting +the payload value that could still be used. +--- + src/broadcom/compiler/vir_live_variables.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c +index 575b0481dc8..87a7e2b5b81 100644 +--- a/src/broadcom/compiler/vir_live_variables.c ++++ b/src/broadcom/compiler/vir_live_variables.c +@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) + flags_inst = NULL; + } + +- /* Payload registers: r0/1/2 contain W, centroid W, +- * and Z at program start. Register allocation will +- * force their nodes to R0/1/2. ++ /* Payload registers: for fragment shaders, W, ++ * centroid W, and Z will be initialized at r0/1/2 ++ * until v42, or r1/r2/r3 from v71. ++ * ++ * For compute shaders, payload would be r0/r2 until ++ * v42, r3/r2 from v71 ++ * ++ * Register allocation will force their nodes to those ++ * registers. 
+ */ + if (inst->src[0].file == QFILE_REG) { +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: ++ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; ++ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2; ++ if (inst->src[0].index >= min_payload_r || ++ inst->src[0].index <= max_payload_r) { + c->temp_start[inst->dst.index] = 0; +- break; + } + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch b/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch new file mode 100644 index 0000000000..28e2ba2dec --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch @@ -0,0 +1,235 @@ +From f9a76b3a1e316e5ed6387819b87eaaf60f989a2b Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 26 Oct 2021 11:43:02 +0200 +Subject: [PATCH 050/142] broadcom/compiler: update peripheral access + restrictions for v71 + +In V3D 4.x only a couple of simultaneous accesses were allowed, but +V3D 7.x is a bit more flexible, so rather than trying to check for all +the allowed combinations it is easier to check if we are one of the +disallowed cases. + +Shader-db (pi5): + +total instructions in shared programs: 11338883 -> 11307386 (-0.28%) +instructions in affected programs: 2727201 -> 2695704 (-1.15%) +helped: 12555 +HURT: 289 +Instructions are helped. + +total max-temps in shared programs: 2230199 -> 2229260 (-0.04%) +max-temps in affected programs: 20508 -> 19569 (-4.58%) +helped: 608 +HURT: 4 +Max-temps are helped. + +total sfu-stalls in shared programs: 15236 -> 15293 (0.37%) +sfu-stalls in affected programs: 148 -> 205 (38.51%) +helped: 38 +HURT: 64 +Inconclusive result (%-change mean confidence interval includes 0). 
+ +total inst-and-stalls in shared programs: 11354119 -> 11322679 (-0.28%) +inst-and-stalls in affected programs: 2732262 -> 2700822 (-1.15%) +helped: 12550 +HURT: 304 +Inst-and-stalls are helped. + +total nops in shared programs: 273711 -> 274095 (0.14%) +nops in affected programs: 9626 -> 10010 (3.99%) +helped: 186 +HURT: 397 +Nops are HURT. +--- + src/broadcom/compiler/qpu_schedule.c | 88 +++++++++++++++++++++------- + src/broadcom/compiler/qpu_validate.c | 2 +- + src/broadcom/qpu/qpu_instr.c | 16 +++-- + src/broadcom/qpu/qpu_instr.h | 2 + + 4 files changed, 82 insertions(+), 26 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index e8197661f89..adb501e85ce 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -790,7 +790,8 @@ enum { + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), +- V3D_PERIPHERAL_TLB = (1 << 9), ++ V3D_PERIPHERAL_TLB_READ = (1 << 9), ++ V3D_PERIPHERAL_TLB_WRITE = (1 << 10), + }; + + static uint32_t +@@ -815,8 +816,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo, + if (v3d_qpu_uses_sfu(inst)) + result |= V3D_PERIPHERAL_SFU; + +- if (v3d_qpu_uses_tlb(inst)) +- result |= V3D_PERIPHERAL_TLB; ++ if (v3d_qpu_reads_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_READ; ++ if (v3d_qpu_writes_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_WRITE; + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && +@@ -847,32 +850,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, + if (devinfo->ver < 41) + return false; + +- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than +- * tmuc). 
++ /* V3D 4.x can't do more than one peripheral access except in a ++ * few cases: + */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && +- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ if (devinfo->ver <= 42) { ++ /* WRTMUC signal with TMU register write (other than tmuc). */ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ } ++ ++ /* TMU read with VPM read/write. */ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (b_peripherals == V3D_PERIPHERAL_VPM_READ || ++ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (a_peripherals == V3D_PERIPHERAL_VPM_READ || ++ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ ++ return false; + } + +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && +- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ /* V3D 7.x can't have more than one of these restricted peripherals */ ++ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | ++ V3D_PERIPHERAL_TMU_WRTMUC_SIG | ++ V3D_PERIPHERAL_TSY | ++ V3D_PERIPHERAL_TLB_READ | ++ V3D_PERIPHERAL_SFU | ++ V3D_PERIPHERAL_VPM_READ | ++ V3D_PERIPHERAL_VPM_WRITE; ++ ++ const uint32_t a_restricted = a_peripherals & restricted; ++ const uint32_t b_restricted = b_peripherals & restricted; ++ if (a_restricted && b_restricted) { ++ /* WRTMUC signal with TMU register write (other than tmuc) is ++ * allowed though. 
++ */ ++ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || ++ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { ++ return false; ++ } + } + +- /* V3D 4.1+ allows TMU read with VPM read/write. */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_READ && +- (b_peripherals == V3D_PERIPHERAL_VPM_READ || +- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ /* Only one TMU read per instruction */ ++ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && ++ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { ++ return false; + } +- if (b_peripherals == V3D_PERIPHERAL_TMU_READ && +- (a_peripherals == V3D_PERIPHERAL_VPM_READ || +- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ ++ /* Only one TLB access per instruction */ ++ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ)) && ++ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ))) { ++ return false; + } + +- return false; ++ return true; + } + + /* Compute a bitmask of which rf registers are used between +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 12788692432..fde6695d59b 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -227,7 +227,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + vpm_writes + + tlb_writes + + tsy_writes + +- inst->sig.ldtmu + ++ (devinfo->ver <= 42 ? 
inst->sig.ldtmu : 0) + + inst->sig.ldtlb + + inst->sig.ldvpm + + inst->sig.ldtlbu > 1) { +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 195a0dcd232..f54ce7210fb 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -649,12 +649,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) + } + + bool +-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) + { +- if (inst->sig.ldtlb || +- inst->sig.ldtlbu) +- return true; ++ return inst->sig.ldtlb || inst->sig.ldtlbu; ++} + ++bool ++v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ++{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -672,6 +674,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) + return false; + } + ++bool ++v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++{ ++ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); ++} ++ + bool + v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) + { +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 4b34d17bd4c..dece45c5c54 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -472,6 +472,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; ++bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +-- +2.39.2 + diff --git 
a/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch b/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch new file mode 100644 index 0000000000..0002304bd8 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch @@ -0,0 +1,61 @@ +From 3520cceb87fb2f9765ba7dbe2771fbd0cadca78d Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 26 Oct 2021 08:37:54 +0200 +Subject: [PATCH 051/142] broadcom/qpu: add packing for fmov on ADD alu + +--- + src/broadcom/qpu/qpu_pack.c | 31 +++++++++++++++++++++++++++++++ + 1 file changed, 31 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 180d7ab08a3..ed5a8bc667d 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -1332,6 +1332,20 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + } + break; + ++ case V3D_QPU_A_FMOV: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ /* Mul alu FMOV has one additional variant */ ++ int32_t unpack = (raddr_b >> 2) & 0x7; ++ if (unpack == 7) ++ return false; ++ ++ if (!v3d_qpu_float32_unpack_unpack(unpack, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; +@@ -1992,6 +2006,23 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + break; + } + ++ case V3D_QPU_A_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b = packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed << 2; ++ break; ++ } ++ + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +-- +2.39.2 + diff --git 
a/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch b/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch new file mode 100644 index 0000000000..f173a0f4c0 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch @@ -0,0 +1,155 @@ +From 7c7ab15b3c9def4bc3bb5be492228a933c325f8a Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 6 Oct 2021 13:58:27 +0200 +Subject: [PATCH 052/142] broadcom/compiler: handle rf0 flops storage + restriction in v71 + +--- + src/broadcom/compiler/qpu_schedule.c | 81 +++++++++++++++++++++++++++- + 1 file changed, 79 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index adb501e85ce..7048d9257b6 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -538,6 +538,10 @@ struct choose_scoreboard { + int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; ++ ++ /* V3D 7.x */ ++ int last_implicit_rf0_write_tick; ++ bool has_rf0_flops_conflict; + }; + + static bool +@@ -1499,6 +1503,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, + } + } + ++static void ++set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && ++ v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ scoreboard->has_rf0_flops_conflict = true; ++ } ++} ++ ++static void ++update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->ver < 71) ++ return; ++ ++ /* Thread switch restrictions: ++ * ++ * At the point of a thread switch or thread end (when the 
actual ++ * thread switch or thread end happens, not when the signalling ++ * instruction is processed): ++ * ++ * - If the most recent write to rf0 was from a ldunif, ldunifa, or ++ * ldvary instruction in which another signal also wrote to the ++ * register file, and the final instruction of the thread section ++ * contained a signal which wrote to the register file, then the ++ * value of rf0 is undefined at the start of the new section ++ * ++ * Here we use the scoreboard to track if our last rf0 implicit write ++ * happens at the same time that another signal writes the register ++ * file (has_rf0_flops_conflict). We will use that information when ++ * scheduling thrsw instructions to avoid putting anything in their ++ * last delay slot which has a signal that writes to the register file. ++ */ ++ ++ /* Reset tracking if we have an explicit rf0 write or we are starting ++ * a new thread section. ++ */ ++ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ scoreboard->tick - scoreboard->last_thrsw_tick == 3) { ++ scoreboard->last_implicit_rf0_write_tick = -10; ++ scoreboard->has_rf0_flops_conflict = false; ++ } ++ ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { ++ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
++ scoreboard->tick + 1 : scoreboard->tick; ++ } ++ ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++} ++ + static void + update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + const struct qinst *qinst, +@@ -1542,6 +1602,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + if (inst->sig.ldvary) + scoreboard->last_ldvary_tick = scoreboard->tick; + ++ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); ++ + update_scoreboard_tmu_tracking(scoreboard, qinst); + } + +@@ -1812,6 +1874,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + */ + static bool + qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, ++ struct choose_scoreboard *scoreboard, + const struct qinst *qinst, + uint32_t slot) + { +@@ -1842,6 +1905,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) + return false; + ++ /* See comment when we set has_rf0_flops_conflict for details */ ++ if (c->devinfo->ver >= 71 && ++ slot == 2 && ++ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && ++ !qinst->qpu.sig_magic) { ++ if (scoreboard->has_rf0_flops_conflict) ++ return false; ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) ++ return false; ++ } ++ + return true; + } + +@@ -1874,7 +1948,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, + * also apply to instructions scheduled after the thrsw that we want + * to place in its delay slots. 
+ */ +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) + return false; + + /* TLB access is disallowed until scoreboard wait is executed, which +@@ -1947,8 +2021,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard + bool is_thrend) + { + for (int slot = 0; slot < instructions_in_sequence; slot++) { +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, ++ qinst, slot)) { + return false; ++ } + + if (is_thrend && + !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { +@@ -2718,6 +2794,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) + scoreboard.last_setmsf_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; ++ scoreboard.last_implicit_rf0_write_tick = - 10; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch new file mode 100644 index 0000000000..ffd2489d53 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch @@ -0,0 +1,189 @@ +From 0c6910721eb50b38b3388c2d2344b6ecfe0fee58 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 27 Oct 2021 11:35:12 +0200 +Subject: [PATCH 053/142] broadcom/compiler: enable ldvary pipelining on v71 + +--- + src/broadcom/compiler/qpu_schedule.c | 121 ++++++++++++++++++--------- + 1 file changed, 80 insertions(+), 41 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 7048d9257b6..334ffdc6d58 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -2312,46 +2312,72 
@@ emit_branch(struct v3d_compile *c, + } + + static bool +-alu_reads_register(struct v3d_qpu_instr *inst, ++alu_reads_register(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst, + bool add, bool magic, uint32_t index) + { + uint32_t num_src; +- enum v3d_qpu_mux mux_a, mux_b; +- +- if (add) { ++ if (add) + num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); +- mux_a = inst->alu.add.a.mux; +- mux_b = inst->alu.add.b.mux; +- } else { ++ else + num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); +- mux_a = inst->alu.mul.a.mux; +- mux_b = inst->alu.mul.b.mux; +- } + +- for (int i = 0; i < num_src; i++) { +- if (magic) { +- if (i == 0 && mux_a == index) +- return true; +- if (i == 1 && mux_b == index) +- return true; ++ if (devinfo->ver <= 42) { ++ enum v3d_qpu_mux mux_a, mux_b; ++ if (add) { ++ mux_a = inst->alu.add.a.mux; ++ mux_b = inst->alu.add.b.mux; + } else { +- if (i == 0 && mux_a == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 0 && mux_a == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; ++ mux_a = inst->alu.mul.a.mux; ++ mux_b = inst->alu.mul.b.mux; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (magic) { ++ if (i == 0 && mux_a == index) ++ return true; ++ if (i == 1 && mux_b == index) ++ return true; ++ } else { ++ if (i == 0 && mux_a == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 0 && mux_a == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } + } + } ++ ++ return false; ++ } ++ ++ assert(devinfo->ver >= 71); ++ assert(!magic); ++ ++ uint32_t raddr_a, raddr_b; ++ if (add) 
{ ++ raddr_a = inst->alu.add.a.raddr; ++ raddr_b = inst->alu.add.b.raddr; ++ } else { ++ raddr_a = inst->alu.mul.a.raddr; ++ raddr_b = inst->alu.mul.b.raddr; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (i == 0 && raddr_a == index) ++ return true; ++ if (i == 1 && raddr_b == index) ++ return true; + } + + return false; +@@ -2386,6 +2412,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + struct qblock *block, + struct v3d_qpu_instr *inst) + { ++ const struct v3d_device_info *devinfo = c->devinfo; ++ + /* We only call this if we have successfully merged an ldvary into a + * previous instruction. + */ +@@ -2398,9 +2426,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + * the ldvary destination, if it does, then moving the ldvary before + * it would overwrite it. + */ +- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) + return false; +- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal +@@ -2436,13 +2464,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + } + + /* The previous instruction cannot have a conflicting signal */ +- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) ++ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; +- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + + /* The previous instruction cannot use flags since ldvary uses the +@@ -2471,14 +2499,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + inst->sig_magic = false; + inst->sig_addr = 0; + +- /* By moving ldvary to the previous instruction we make it update +- * r5 in the current one, so nothing else in it should write 
r5. +- * This should've been prevented by our dependency tracking, which ++ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ ++ if (devinfo->ver >= 71) { ++ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++ } ++ ++ /* By moving ldvary to the previous instruction we make it update r5 ++ * (rf0 for ver >= 71) in the current one, so nothing else in it ++ * should write this register. ++ * ++ * This should've been prevented by our depedency tracking, which + * would not allow ldvary to be paired up with an instruction that +- * writes r5 (since our dependency tracking doesn't know that the +- * ldvary write r5 happens in the next instruction). ++ * writes r5/rf0 (since our dependency tracking doesn't know that the ++ * ldvary write to r5/rf0 happens in the next instruction). + */ +- assert(!v3d_qpu_writes_r5(c->devinfo, inst)); ++ assert(!v3d_qpu_writes_r5(devinfo, inst)); ++ assert(devinfo->ver <= 42 || ++ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); + + return true; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch b/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch new file mode 100644 index 0000000000..5e4dc3adce --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch @@ -0,0 +1,144 @@ +From 0670d642bb91fc68ce73f2d9fb88c482295a446d Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 28 Oct 2021 14:13:29 +0200 +Subject: [PATCH 054/142] broadcom/compiler: try to use ldunif(a) instead of + ldunif(a)rf in v71 + +The rf variants need to encode the destination in the cond bits, which +prevents these to be merged with any other instruction that need them. 
+ +In 4.x, ldunif(a) write to r5 which is a special register that only +ldunif(a) and ldvary can write so we have a special register class for +it and only allow it for them. Then when we need to choose a register +for a node, if this register is available we always use it. + +In 7.x these instructions write to rf0, which can be used by any +instruction, so instead of restricting rf0, we track the temps that +are used as ldunif(a) destinations and use that information to favor +rf0 for them. +--- + src/broadcom/compiler/v3d_compiler.h | 3 ++ + src/broadcom/compiler/vir_register_allocate.c | 34 ++++++++++++++++--- + src/broadcom/compiler/vir_to_qpu.c | 11 ++++-- + 3 files changed, 41 insertions(+), 7 deletions(-) + +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index 7e8f3bfc1a7..36adf8830b5 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -613,6 +613,9 @@ struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; ++ ++ /* V3D 7.x */ ++ bool is_ldunif_dst; + } *info; + uint32_t alloc_count; + }; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index e0adc1de7a4..1be091f8518 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -384,6 +384,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; ++ c->nodes.info[node].is_ldunif_dst = false; + } + + /* The spill offset for this thread takes a bit of setup, so do it once at +@@ -899,9 +900,22 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + + static bool + v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, ++ unsigned int node, + BITSET_WORD *regs, + unsigned int *out) + { ++ /* In V3D 7.x, 
try to assign rf0 to temps used as ldunif's dst ++ * so we can avoid turning them into ldunifrf (which uses the ++ * cond field to encode the dst and would prevent merge with ++ * instructions that use cond flags). ++ */ ++ if (v3d_ra->nodes->info[node].is_ldunif_dst && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ assert(v3d_ra->devinfo->ver >= 71); ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; + int phys = v3d_ra->phys_index + phys_off; +@@ -927,7 +941,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) + return reg; + } + +- if (v3d_ra_select_rf(v3d_ra, regs, ®)) ++ if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) + return reg; + + /* If we ran out of physical registers try to assign an accumulator +@@ -1139,15 +1153,24 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + } + } + } else { +- /* If the instruction has an implicit write +- * we can't allocate its dest to the same +- * register. ++ /* Make sure we don't allocate the ldvary's ++ * destination to rf0, since it would clash ++ * with its implicit write to that register. + */ +- if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { ++ if (inst->qpu.sig.ldvary) { + ra_add_node_interference(c->g, + temp_to_node(c, inst->dst.index), + implicit_rf_nodes[0]); + } ++ /* Flag dst temps from ldunif(a) instructions ++ * so we can try to assign rf0 to them and avoid ++ * converting these to ldunif(a)rf. ++ */ ++ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { ++ const uint32_t dst_n = ++ temp_to_node(c, inst->dst.index); ++ c->nodes.info[dst_n].is_ldunif_dst = true; ++ } + } + } + +@@ -1222,6 +1245,7 @@ v3d_register_allocate(struct v3d_compile *c) + * without accumulators that can have implicit writes to phys regs. 
+ */ + for (uint32_t i = 0; i < num_ra_nodes; i++) { ++ c->nodes.info[i].is_ldunif_dst = false; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index afc4941fdb1..cbbb495592b 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -345,8 +345,15 @@ v3d_generate_code_block(struct v3d_compile *c, + assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + +- if (!dst.magic || +- dst.index != V3D_QPU_WADDR_R5) { ++ bool use_rf; ++ if (c->devinfo->has_accumulators) { ++ use_rf = !dst.magic || ++ dst.index != V3D_QPU_WADDR_R5; ++ } else { ++ use_rf = dst.magic || dst.index != 0; ++ } ++ ++ if (use_rf) { + assert(c->devinfo->ver >= 40); + + if (qinst->qpu.sig.ldunif) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch b/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch new file mode 100644 index 0000000000..d03707a3fc --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch @@ -0,0 +1,82 @@ +From cbed3b97394da09c9ae644c79e098e3ba8b5c3e8 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 29 Oct 2021 13:00:56 +0200 +Subject: [PATCH 055/142] broadcom/compiler: don't assign rf0 to temps that + conflict with ldvary + +ldvary writes to rf0 implicitly, so we don't want to allocate rf0 to +any temps that are live across ldvary's rf0 live ranges. 
+--- + src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 1be091f8518..6f7b1ca0589 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -1019,6 +1019,7 @@ static void + update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, ++ int last_ldvary_ip, + struct qinst *inst) + { + int32_t ip = inst->ip; +@@ -1125,6 +1126,25 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + } + } + ++ /* Don't allocate rf0 to temps that cross ranges where we have ++ * live implicit rf0 writes from ldvary. We can identify these ++ * by tracking the last ldvary instruction and explicit reads ++ * of rf0. ++ */ ++ if (c->devinfo->ver >= 71 && ++ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || ++ (vir_get_nsrc(inst) > 1 && ++ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && ++ c->temp_end[i] > last_ldvary_ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. +@@ -1270,10 +1290,27 @@ v3d_register_allocate(struct v3d_compile *c) + * interferences. + */ + int ip = 0; ++ int last_ldvary_ip = -1; + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; ++ ++ /* ldunif(a) always write to a temporary, so we have ++ * liveness info available to decide if rf0 is ++ * available for them, however, ldvary is different: ++ * it always writes to rf0 directly so we don't have ++ * liveness information for its implicit rf0 write. 
++ * ++ * That means the allocator may assign rf0 to a temp ++ * that is defined while an implicit rf0 write from ++ * ldvary is still live. We fix that by manually ++ * tracking rf0 live ranges from ldvary instructions. ++ */ ++ if (inst->qpu.sig.ldvary) ++ last_ldvary_ip = ip; ++ + update_graph_and_reg_classes_for_inst(c, acc_nodes, +- implicit_rf_nodes, inst); ++ implicit_rf_nodes, ++ last_ldvary_ip, inst); + } + + /* Set the register classes for all our temporaries in the graph */ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch b/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch new file mode 100644 index 0000000000..dac7b03bfc --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch @@ -0,0 +1,139 @@ +From cbaa469c09974c1574b16f559173694904fe1bb0 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 25 Oct 2021 09:38:57 +0200 +Subject: [PATCH 056/142] broadcom/compiler: convert mul to add when needed to + allow merge + +V3D 7.x added 'mov' opcodes to the ADD alu, so now it is possible to +move these to the ADD alu to facilitate merging them with other MUL +instructions. 
+--- + src/broadcom/compiler/qpu_schedule.c | 102 ++++++++++++++++++++++++--- + 1 file changed, 94 insertions(+), 8 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 334ffdc6d58..caa84254998 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1086,6 +1086,57 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + } + ++static bool ++can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ case V3D_QPU_M_FMOV: ++ return devinfo->ver >= 71; ++ default: ++ return false; ++ } ++} ++ ++static enum v3d_qpu_mul_op ++mul_op_as_add_op(enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ return V3D_QPU_A_MOV; ++ case V3D_QPU_M_FMOV: ++ return V3D_QPU_A_FMOV; ++ default: ++ unreachable("unexpected mov opcode"); ++ } ++} ++ ++static void ++qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) ++{ ++ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); ++ assert(inst->alu.mul.op != V3D_QPU_M_NOP); ++ assert(inst->alu.add.op == V3D_QPU_A_NOP); ++ ++ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); ++ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); ++ inst->alu.mul.op = V3D_QPU_M_NOP; ++ ++ inst->flags.ac = inst->flags.mc; ++ inst->flags.apf = inst->flags.mpf; ++ inst->flags.auf = inst->flags.muf; ++ inst->flags.mc = V3D_QPU_COND_NONE; ++ inst->flags.mpf = V3D_QPU_PF_NONE; ++ inst->flags.muf = V3D_QPU_UF_NONE; ++ ++ inst->alu.add.output_pack = inst->alu.mul.output_pack; ++ inst->alu.add.a.unpack = inst->alu.mul.a.unpack; ++ inst->alu.add.b.unpack = inst->alu.mul.b.unpack; ++ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++} ++ + static bool + qpu_merge_inst(const struct v3d_device_info *devinfo, + struct 
v3d_qpu_instr *result, +@@ -1151,17 +1202,52 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + } + } + ++ struct v3d_qpu_instr add_inst; + if (b->alu.mul.op != V3D_QPU_M_NOP) { +- if (a->alu.mul.op != V3D_QPU_M_NOP) +- return false; +- merge.alu.mul = b->alu.mul; ++ if (a->alu.mul.op == V3D_QPU_M_NOP) { ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = a; ++ } ++ /* If a's mul op is used but its add op is not, then see if we ++ * can convert either a's mul op or b's mul op to an add op ++ * so we can merge. ++ */ ++ else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, b->alu.mul.op)) { ++ add_inst = *b; ++ qpu_convert_mul_to_add(&add_inst); + +- merge.flags.mc = b->flags.mc; +- merge.flags.mpf = b->flags.mpf; +- merge.flags.muf = b->flags.muf; ++ merge.alu.add = add_inst.alu.add; + +- mul_instr = b; +- add_instr = a; ++ merge.flags.ac = b->flags.mc; ++ merge.flags.apf = b->flags.mpf; ++ merge.flags.auf = b->flags.muf; ++ ++ mul_instr = a; ++ add_instr = &add_inst; ++ } else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, a->alu.mul.op)) { ++ add_inst = *a; ++ qpu_convert_mul_to_add(&add_inst); ++ ++ merge = add_inst; ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = &add_inst; ++ } else { ++ return false; ++ } + } + + /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch new file mode 100644 index 0000000000..02310764ef --- /dev/null +++ 
b/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch @@ -0,0 +1,418 @@ +From b59b3725fb16f4ab1ac0db86a5452a4ed6176074 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 3 Nov 2021 10:34:19 +0100 +Subject: [PATCH 057/142] broadcom/compiler: implement small immediates for v71 + +--- + src/broadcom/compiler/qpu_schedule.c | 90 +++++++++++++------ + src/broadcom/compiler/qpu_validate.c | 20 ++++- + .../compiler/vir_opt_small_immediates.c | 26 +++++- + src/broadcom/compiler/vir_to_qpu.c | 11 ++- + src/broadcom/qpu/qpu_disasm.c | 1 - + src/broadcom/qpu/qpu_instr.c | 8 +- + src/broadcom/qpu/qpu_instr.h | 2 +- + src/broadcom/qpu/qpu_pack.c | 36 ++++---- + 8 files changed, 139 insertions(+), 55 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index caa84254998..bd1c920848a 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -714,7 +714,6 @@ qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { +- /* FIXME: skip if small immediate */ + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } +@@ -948,10 +947,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, + return raddrs_used; + } + +-/* Take two instructions and attempt to merge their raddr fields +- * into one merged instruction. Returns false if the two instructions +- * access more than two different rf registers between them, or more +- * than one rf register and one small immediate. ++/* Takes two instructions and attempts to merge their raddr fields (including ++ * small immediates) into one merged instruction. For V3D 4.x, returns false ++ * if the two instructions access more than two different rf registers between ++ * them, or more than one rf register and one small immediate. For 7.x returns ++ * false if both instructions use small immediates. 
+ */ + static bool + qpu_merge_raddrs(struct v3d_qpu_instr *result, +@@ -959,6 +959,27 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) + { ++ if (devinfo->ver >= 71) { ++ assert(add_instr->sig.small_imm_a + ++ add_instr->sig.small_imm_b <= 1); ++ assert(add_instr->sig.small_imm_c + ++ add_instr->sig.small_imm_d == 0); ++ assert(mul_instr->sig.small_imm_a + ++ mul_instr->sig.small_imm_b == 0); ++ assert(mul_instr->sig.small_imm_c + ++ mul_instr->sig.small_imm_d <= 1); ++ ++ result->sig.small_imm_a = add_instr->sig.small_imm_a; ++ result->sig.small_imm_b = add_instr->sig.small_imm_b; ++ result->sig.small_imm_c = mul_instr->sig.small_imm_c; ++ result->sig.small_imm_d = mul_instr->sig.small_imm_d; ++ ++ return (result->sig.small_imm_a + ++ result->sig.small_imm_b + ++ result->sig.small_imm_c + ++ result->sig.small_imm_d) <= 1; ++ } ++ + assert(devinfo->ver <= 42); + + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); +@@ -1060,7 +1081,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) + } + + static void +-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) ++qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst) + { + STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); + assert(inst->alu.add.op != V3D_QPU_A_NOP); +@@ -1084,6 +1106,18 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ if (devinfo->ver >= 71) { ++ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); ++ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); ++ if (inst->sig.small_imm_a) { ++ inst->sig.small_imm_c = true; ++ inst->sig.small_imm_a = false; ++ } else if (inst->sig.small_imm_b) { ++ inst->sig.small_imm_d = true; ++ inst->sig.small_imm_b = false; ++ } ++ } + } + + static bool +@@ -1135,6 
+1169,16 @@ qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); ++ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); ++ if (inst->sig.small_imm_c) { ++ inst->sig.small_imm_a = true; ++ inst->sig.small_imm_c = false; ++ } else if (inst->sig.small_imm_d) { ++ inst->sig.small_imm_b = true; ++ inst->sig.small_imm_d = false; ++ } + } + + static bool +@@ -1173,20 +1217,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(b->alu.add.op)) { + mul_inst = *b; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge.alu.mul = mul_inst.alu.mul; + +- merge.flags.mc = b->flags.ac; +- merge.flags.mpf = b->flags.apf; +- merge.flags.muf = b->flags.auf; ++ merge.flags.mc = mul_inst.flags.mc; ++ merge.flags.mpf = mul_inst.flags.mpf; ++ merge.flags.muf = mul_inst.flags.muf; + + add_instr = a; + mul_instr = &mul_inst; + } else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(a->alu.add.op)) { + mul_inst = *a; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge = mul_inst; + merge.alu.add = b->alu.add; +@@ -1225,9 +1269,9 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + + merge.alu.add = add_inst.alu.add; + +- merge.flags.ac = b->flags.mc; +- merge.flags.apf = b->flags.mpf; +- merge.flags.auf = b->flags.muf; ++ merge.flags.ac = add_inst.flags.ac; ++ merge.flags.apf = add_inst.flags.apf; ++ merge.flags.auf = add_inst.flags.auf; + + mul_instr = a; + add_instr = &add_inst; +@@ -1252,17 +1296,12 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + + /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and + * they have restrictions on the number of raddrs that can be adressed +- * in a 
single instruction. +- * +- * FIXME: for V3D 7.x we can't merge instructions if they address more +- * than one small immediate. For now, we don't support small immediates, +- * so it is not a problem. ++ * in a single instruction. In V3D 7.x, we don't have that restriction, ++ * but we are still limited to a single small immediate per instruction. + */ +- if (devinfo->ver <= 42) { +- if (add_instr && mul_instr && +- !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { +- return false; +- } ++ if (add_instr && mul_instr && ++ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { ++ return false; + } + + merge.sig.thrsw |= b->sig.thrsw; +@@ -1273,7 +1312,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; +- merge.sig.small_imm_b |= b->sig.small_imm_b; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; +@@ -1933,8 +1971,6 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. 
+- * +- * FIXME: handle small immediate cases + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index fde6695d59b..41070484286 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -116,8 +116,24 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + return; + + if (devinfo->ver < 71) { +- if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d) +- fail_instr(state, "small imm a/c/d added after V3D 7.1"); ++ if (inst->sig.small_imm_a || inst->sig.small_imm_c || ++ inst->sig.small_imm_d) { ++ fail_instr(state, "small imm a/c/d added after V3D 7.1"); ++ } ++ } else { ++ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && ++ !vir_is_add(qinst)) { ++ fail_instr(state, "small imm a/b used but no ADD inst"); ++ } ++ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && ++ !vir_is_mul(qinst)) { ++ fail_instr(state, "small imm c/d used but no MUL inst"); ++ } ++ if (inst->sig.small_imm_a + inst->sig.small_imm_b + ++ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { ++ fail_instr(state, "only one small immediate can be " ++ "enabled per instruction"); ++ } + } + + /* LDVARY writes r5 two instructions later and LDUNIF writes +diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c +index df0d6c36c9b..ed5bc011964 100644 +--- a/src/broadcom/compiler/vir_opt_small_immediates.c ++++ b/src/broadcom/compiler/vir_opt_small_immediates.c +@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) + /* The small immediate value sits in the raddr B field, so we + * can't have 2 small immediates in one instruction (unless + * they're the same value, but that should be optimized away +- * elsewhere). ++ * elsewhere). 
Since 7.x we can encode small immediates in ++ * any raddr field, but each instruction can still only use ++ * one. + */ + bool uses_small_imm = false; + for (int i = 0; i < vir_get_nsrc(inst); i++) { +@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) + */ + struct v3d_qpu_sig new_sig = inst->qpu.sig; + uint32_t sig_packed; +- new_sig.small_imm_b = true; ++ if (c->devinfo->ver <= 42) { ++ new_sig.small_imm_b = true; ++ } else { ++ if (vir_is_add(inst)) { ++ if (i == 0) ++ new_sig.small_imm_a = true; ++ else ++ new_sig.small_imm_b = true; ++ } else { ++ if (i == 0) ++ new_sig.small_imm_c = true; ++ else ++ new_sig.small_imm_d = true; ++ } ++ } ++ + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) + continue; + +@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } +- inst->qpu.sig.small_imm_b = true; ++ inst->qpu.sig.small_imm_a = new_sig.small_imm_a; ++ inst->qpu.sig.small_imm_b = new_sig.small_imm_b; ++ inst->qpu.sig.small_imm_c = new_sig.small_imm_c; ++ inst->qpu.sig.small_imm_d = new_sig.small_imm_d; + inst->qpu.raddr_b = packed; + + inst->src[i].file = QFILE_SMALL_IMM; +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index cbbb495592b..4ed184cbbcb 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -89,8 +89,15 @@ new_qpu_nop_before(struct qinst *inst) + static void + v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) + { +- if (src.smimm) +- unreachable("v3d71_set_src: pending handling small immediates"); ++ /* If we have a small immediate move it from inst->raddr_b to the ++ * corresponding raddr. 
++ */ ++ if (src.smimm) { ++ assert(instr->sig.small_imm_a || instr->sig.small_imm_b || ++ instr->sig.small_imm_c || instr->sig.small_imm_d); ++ *raddr = instr->raddr_b; ++ return; ++ } + + assert(!src.magic); + *raddr = src.index; +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index b613de781dc..c1590a760de 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -113,7 +113,6 @@ v3d71_qpu_disasm_raddr(struct disasm_state *disasm, + } + + if (is_small_imm) { +- unreachable("Pending handling small immediates"); + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index f54ce7210fb..c30f4bbbccf 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -975,10 +975,10 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + +- return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) || +- (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) || +- (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) || +- (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr); ++ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) || ++ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || ++ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || ++ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); + } + + bool +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index dece45c5c54..d408fb426fa 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -402,7 +402,7 @@ struct v3d_qpu_instr { + uint8_t sig_addr; + bool sig_magic; /* If the signal writes to a magic address */ + uint8_t raddr_a; /* V3D 4.x */ +- uint8_t raddr_b; /* V3D 
4.x*/ ++ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ + struct v3d_qpu_flags flags; + + union { +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index ed5a8bc667d..7984712d527 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -1218,16 +1218,11 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst + + instr->alu.add.op = desc->op; + +- /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the ++ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the + * operands. + */ +- /* FIXME: for now hardcoded values, until we got the small_imm support +- * in place +- */ +- uint32_t small_imm_a = 0; +- uint32_t small_imm_b = 0; +- if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > +- small_imm_b *256 + (op & 3) * 64 + raddr_b) { ++ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { + if (instr->alu.add.op == V3D_QPU_A_FMIN) + instr->alu.add.op = V3D_QPU_A_FMAX; + if (instr->alu.add.op == V3D_QPU_A_FADD) +@@ -1858,11 +1853,6 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + uint32_t output_pack; + uint32_t a_unpack; + uint32_t b_unpack; +- /* FIXME: for now hardcoded values, until we got the small_imm +- * support in place +- */ +- uint32_t small_imm_a = 0; +- uint32_t small_imm_b = 0; + + if (instr->alu.add.op != V3D_QPU_A_FCMP) { + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, +@@ -1886,8 +1876,8 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + * distinguished by which order their operands come in. 
+ */ + bool ordering = +- small_imm_a * 256 + a_unpack * 64 + raddr_a > +- small_imm_b * 256 + b_unpack * 64 + raddr_b; ++ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; + if (((instr->alu.add.op == V3D_QPU_A_FMIN || + instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || + ((instr->alu.add.op == V3D_QPU_A_FMAX || +@@ -1901,6 +1891,22 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + temp = raddr_a; + raddr_a = raddr_b; + raddr_b = temp; ++ ++ /* If we are swapping raddr_a/b we also need to swap ++ * small_imm_a/b. ++ */ ++ if (instr->sig.small_imm_a || instr->sig.small_imm_b) { ++ assert(instr->sig.small_imm_a != ++ instr->sig.small_imm_b); ++ struct v3d_qpu_sig new_sig = instr->sig; ++ new_sig.small_imm_a = !instr->sig.small_imm_a; ++ new_sig.small_imm_b = !instr->sig.small_imm_b; ++ uint32_t sig; ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) ++ return false; ++ *packed_instr &= ~V3D_QPU_SIG_MASK; ++ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); ++ } + } + + opcode |= a_unpack << 2; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch b/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch new file mode 100644 index 0000000000..cd5c07f5eb --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch @@ -0,0 +1,61 @@ +From 3af87d2672da7c928ecf8a0a1cd1bef8a6729364 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 22 Nov 2021 12:56:03 +0100 +Subject: [PATCH 058/142] broadcom/compiler: update thread end restrictions for + v7.x + +In 4.x it is not allowed to write to the register file in the last +3 instructions, but in 7.x we only have this restriction in the +thread end instruction itself, and only if the write comes from +the ALU ports. 
+--- + src/broadcom/compiler/qpu_schedule.c | 31 ++++++++++++++++++++-------- + 1 file changed, 22 insertions(+), 9 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index bd1c920848a..cba16c77d67 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1938,17 +1938,30 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + return false; + } + +- /* No writing physical registers at the end. */ +- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; +- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; +- if ((!add_is_nop && !inst->alu.add.magic_write) || +- (!mul_is_nop && !inst->alu.mul.magic_write)) { +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* No writing physical registers at the end. */ ++ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; ++ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; ++ if ((!add_is_nop && !inst->alu.add.magic_write) || ++ (!mul_is_nop && !inst->alu.mul.magic_write)) { ++ return false; ++ } ++ ++ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ return false; ++ } + } + +- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && +- !inst->sig_magic) { +- return false; ++ if (c->devinfo->ver >= 71) { ++ /* The thread end instruction must not write to the ++ * register file via the add/mul ALUs. 
++ */ ++ if (slot == 0 && ++ (!inst->alu.add.magic_write || ++ !inst->alu.mul.magic_write)) { ++ return false; ++ } + } + + if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch b/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch new file mode 100644 index 0000000000..515f12d5d5 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch @@ -0,0 +1,112 @@ +From 7cfd5b808bb2f1cb17f57435cb5d411c4ac3aa6c Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 23 Nov 2021 10:04:49 +0100 +Subject: [PATCH 059/142] broadcom/compiler: update ldvary thread switch delay + slot restriction for v7.x + +In V3D 7.x we don't have accumulators which would not survive a thread +switch, so the only restriction is that ldvary can't be placed in the second +delay slot of a thread switch. + +shader-db results for UnrealEngine4 shaders: + +total instructions in shared programs: 446458 -> 446401 (-0.01%) +instructions in affected programs: 13492 -> 13435 (-0.42%) +helped: 58 +HURT: 3 +Instructions are helped. + +total nops in shared programs: 19571 -> 19541 (-0.15%) +nops in affected programs: 161 -> 131 (-18.63%) +helped: 30 +HURT: 0 +Nops are helped. 
+--- + src/broadcom/compiler/qpu_schedule.c | 33 +++++++++++++++++++++------- + src/broadcom/compiler/qpu_validate.c | 10 +++++++-- + 2 files changed, 33 insertions(+), 10 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index cba16c77d67..32f651851cf 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1491,11 +1491,20 @@ retry: + * ldvary now if the follow-up fixup would place + * it in the delay slots of a thrsw, which is not + * allowed and would prevent the fixup from being +- * successful. ++ * successful. In V3D 7.x we can allow this to happen ++ * as long as it is not the last delay slot. + */ +- if (inst->sig.ldvary && +- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { +- continue; ++ if (inst->sig.ldvary) { ++ if (c->devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 >= ++ scoreboard->tick - 1) { ++ continue; ++ } ++ if (c->devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 == ++ scoreboard->tick - 1) { ++ continue; ++ } + } + + /* We can emit a new tmu lookup with a previous ldtmu +@@ -2020,8 +2029,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + +- if (slot > 0 && qinst->qpu.sig.ldvary) +- return false; ++ if (qinst->qpu.sig.ldvary) { ++ if (c->devinfo->ver <= 42 && slot > 0) ++ return false; ++ if (c->devinfo->ver >= 71 && slot == 2) ++ return false; ++ } + + /* unifa and the following 3 instructions can't overlap a + * thread switch/end. The docs further clarify that this means +@@ -2618,9 +2631,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + + /* We can't put an ldvary in the delay slots of a thrsw. We should've + * prevented this when pairing up the ldvary with another instruction +- * and flagging it for a fixup. ++ * and flagging it for a fixup. In V3D 7.x this is limited only to the ++ * second delay slot. 
+ */ +- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); ++ assert((devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || ++ (devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); + + /* Move the ldvary to the previous instruction and remove it from the + * current one. +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 41070484286..4f09aa8aef4 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -215,8 +215,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + "SFU write started during THRSW delay slots "); + } + +- if (inst->sig.ldvary) +- fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (inst->sig.ldvary) { ++ if (devinfo->ver <= 42) ++ fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (devinfo->ver >= 71 && ++ state->ip - state->last_thrsw_ip == 2) { ++ fail_instr(state, "LDVARY in 2nd THRSW delay slot"); ++ } ++ } + } + + (void)qpu_magic_waddr_matches; /* XXX */ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch b/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch new file mode 100644 index 0000000000..7c78c6938b --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch @@ -0,0 +1,30 @@ +From ca4063d627cd31c589a8e8688f2876dd8211d1bc Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 25 Nov 2021 08:31:02 +0100 +Subject: [PATCH 060/142] broadcom/compiler: lift restriction for branch + + msfign after setmsf for v7.x + +--- + src/broadcom/compiler/qpu_schedule.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 32f651851cf..476eae691ab 
100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -2373,10 +2373,11 @@ emit_branch(struct v3d_compile *c, + assert(scoreboard->last_branch_tick + 3 < branch_tick); + assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); + +- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after ++ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after + * setmsf. + */ + bool is_safe_msf_branch = ++ c->devinfo->ver >= 71 || + inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch new file mode 100644 index 0000000000..8bff29c318 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch @@ -0,0 +1,38 @@ +From 167510aa43bbcf06e57a64495cee40e8cdaf5f8b Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 26 Nov 2021 10:37:05 +0100 +Subject: [PATCH 061/142] broadcom/compiler: start allocating from RF 4 in V7.x + +In V3D 4.x we start at RF3 so that we allocate RF0-2 only if there +aren't any other RFs available. This is useful with small shaders +to ensure that our TLB writes don't use these registers because +these are the last instructions we emit in fragment shaders and +the last instructions in a program can't write to these registers, +so if we do, we need to emit NOPs. + +In V3D 7.x the registers affected by this restriction are RF2-3, +so we choose to start at RF4. 
+--- + src/broadcom/compiler/vir_register_allocate.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 6f7b1ca0589..440b093a636 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -1234,9 +1234,10 @@ v3d_register_allocate(struct v3d_compile *c) + .phys_index = phys_index, + .next_acc = 0, + /* Start at RF3, to try to keep the TLB writes from using +- * RF0-2. ++ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from ++ * using RF2-3. + */ +- .next_phys = 3, ++ .next_phys = c->devinfo->ver <= 42 ? 3 : 4, + .nodes = &c->nodes, + .devinfo = c->devinfo, + }; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch b/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch new file mode 100644 index 0000000000..f1f210e47a --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch @@ -0,0 +1,71 @@ +From d47ea903b96e43b07bdef21f8026da818e30fcd1 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 25 Nov 2021 13:00:34 +0100 +Subject: [PATCH 062/142] broadcom/compiler: validate restrictions after TLB Z + write + +--- + src/broadcom/compiler/qpu_validate.c | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 4f09aa8aef4..1082fb7d50a 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { + int last_sfu_write; + int last_branch_ip; + int last_thrsw_ip; ++ int first_tlb_z_write; + + /* Set when we've found the last-THRSW signal, or if we were started + * in single-segment mode. 
+@@ -110,11 +111,37 @@ static void + qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + { + const struct v3d_device_info *devinfo = state->c->devinfo; ++ ++ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) ++ state->first_tlb_z_write = state->ip; ++ + const struct v3d_qpu_instr *inst = &qinst->qpu; + ++ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->branch.msfign != V3D_QPU_MSFIGN_NONE && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { ++ fail_instr(state, "Implicit branch MSF read after TLB Z write"); ++ } ++ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + ++ if (inst->alu.add.op == V3D_QPU_A_SETMSF && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write) { ++ fail_instr(state, "SETMSF after TLB Z write"); ++ } ++ ++ if (state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->alu.add.op == V3D_QPU_A_MSF) { ++ fail_instr(state, "MSF read after TLB Z write"); ++ } ++ + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { +@@ -348,6 +375,7 @@ qpu_validate(struct v3d_compile *c) + .last_sfu_write = -10, + .last_thrsw_ip = -10, + .last_branch_ip = -10, ++ .first_tlb_z_write = INT_MAX, + .ip = 0, + + .last_thrsw_found = !c->last_thrsw, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch b/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch new file mode 100644 index 0000000000..7cfdab4c05 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch @@ -0,0 +1,26 @@ +From 6cdf01fad49489b5fc66d231b527de5245d5de32 Mon Sep 17 00:00:00 
2001 +From: Iago Toral Quiroga +Date: Mon, 29 Nov 2021 13:23:11 +0100 +Subject: [PATCH 063/142] broadcom/compiler: lift restriction on vpmwt in last + instruction for V3D 7.x + +--- + src/broadcom/compiler/qpu_schedule.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 476eae691ab..77fb6a794e6 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -1934,7 +1934,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + if (slot > 0 && qinst->uniform != ~0) + return false; + +- if (v3d_qpu_waits_vpm(inst)) ++ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst)) + return false; + + if (inst->sig.ldvary) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch new file mode 100644 index 0000000000..080764c6d0 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch @@ -0,0 +1,134 @@ +From acc54637f0787ba4dc887130c25c628ccdaf4e38 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 9 Nov 2021 11:34:59 +0100 +Subject: [PATCH 064/142] broadcom/compiler: fix up copy propagation for v71 + +Update rules for unsafe copy propagations to match v7.x. 
+--- + .../compiler/vir_opt_copy_propagate.c | 83 +++++++++++++------ + 1 file changed, 56 insertions(+), 27 deletions(-) + +diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c +index c4aa7255a17..1260838ca05 100644 +--- a/src/broadcom/compiler/vir_opt_copy_propagate.c ++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c +@@ -35,7 +35,7 @@ + #include "v3d_compiler.h" + + static bool +-is_copy_mov(struct qinst *inst) ++is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) + { + if (!inst) + return false; +@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) + return false; + } + +- switch (inst->src[0].file) { +- case QFILE_MAGIC: +- /* No copy propagating from R3/R4/R5 -- the MOVs from those +- * are there to register allocate values produced into R3/4/5 +- * to other regs (though hopefully r3/4/5). +- */ +- switch (inst->src[0].index) { +- case V3D_QPU_WADDR_R3: +- case V3D_QPU_WADDR_R4: +- case V3D_QPU_WADDR_R5: +- return false; ++ if (devinfo->ver <= 42) { ++ switch (inst->src[0].file) { ++ case QFILE_MAGIC: ++ /* No copy propagating from R3/R4/R5 -- the MOVs from ++ * those are there to register allocate values produced ++ * into R3/4/5 to other regs (though hopefully r3/4/5). ++ */ ++ switch (inst->src[0].index) { ++ case V3D_QPU_WADDR_R3: ++ case V3D_QPU_WADDR_R4: ++ case V3D_QPU_WADDR_R5: ++ return false; ++ default: ++ break; ++ } ++ break; ++ ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ case 0: ++ case 1: ++ case 2: ++ /* MOVs from rf0/1/2 are only to track the live ++ * intervals for W/centroid W/Z. 
++ */ ++ return false; ++ } ++ break; ++ + default: + break; + } +- break; +- +- case QFILE_REG: +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: +- /* MOVs from rf0/1/2 are only to track the live ++ } else { ++ assert(devinfo->ver >= 71); ++ switch (inst->src[0].file) { ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ /* MOVs from rf1/2/3 are only to track the live + * intervals for W/centroid W/Z. ++ * ++ * Note: rf0 can be implicitly written by ldvary ++ * (no temp involved), so it is not an SSA value and ++ * could clash with writes to other temps that are ++ * also allocated to rf0. In theory, that would mean ++ * that we can't copy propagate from it, but we handle ++ * this at register allocation time, preventing temps ++ * from being allocated to rf0 while the rf0 value from ++ * ldvary is still live. + */ +- return false; +- } +- break; ++ case 1: ++ case 2: ++ case 3: ++ return false; ++ } ++ break; + +- default: +- break; ++ default: ++ break; ++ } + } + + return true; +@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + */ + struct qinst *mov = movs[inst->src[i].index]; + if (!mov) { +- if (!is_copy_mov(c->defs[inst->src[i].index])) ++ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) + continue; + mov = c->defs[inst->src[i].index]; + +@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c) + + apply_kills(c, movs, inst); + +- if (is_copy_mov(inst)) ++ if (is_copy_mov(c->devinfo, inst)) + movs[inst->dst.index] = inst; + } + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch b/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch new file mode 100644 index 0000000000..5bd7e35514 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch @@ -0,0 +1,150 @@ +From 
c340f7f1eb4a1e5c0fafe1ea2f801f2ebaf82d8d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 26 Nov 2021 01:24:12 +0100 +Subject: [PATCH 065/142] broadcom/qpu: new packing/conversion v71 instructions + +This commits adds the qpu definitions for several new v71 +instructions. + +Packing: + * vpack does a 2x32 to 2x16 bit integer pack + * v8pack: Pack 2 x 2x16 bit integers into 4x8 bits + * v10pack packs parts of 2 2x16 bit integer into r10g10b10a2. + * v11fpack packs parts of 2 2x16 bit float into r11g11b10 rounding + to nearest + +Conversion to unorm/snorm: + * vftounorm8/vftosnorm8: converts from 2x16-bit floating point + to 2x8 bit unorm/snorm. + * ftounorm16/ftosnorm16: converts floating point to 16-bit + unorm/snorm + * vftounorm10lo: Convert 2x16-bit floating point to 2x10-bit unorm + * vftounorm10hi: Convert 2x16-bit floating point to one 2-bit and one 10-bit unorm +--- + src/broadcom/qpu/qpu_instr.c | 20 ++++++++++++++++++++ + src/broadcom/qpu/qpu_instr.h | 12 ++++++++++++ + src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++ + 3 files changed, 44 insertions(+) + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index c30f4bbbccf..44f20618a5a 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -179,6 +179,10 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) + [V3D_QPU_A_UTOF] = "utof", + [V3D_QPU_A_MOV] = "mov", + [V3D_QPU_A_FMOV] = "fmov", ++ [V3D_QPU_A_VPACK] = "vpack", ++ [V3D_QPU_A_V8PACK] = "v8pack", ++ [V3D_QPU_A_V10PACK] = "v10pack", ++ [V3D_QPU_A_V11FPACK] = "v11fpack", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -201,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) + [V3D_QPU_M_MOV] = "mov", + [V3D_QPU_M_NOP] = "nop", + [V3D_QPU_M_FMUL] = "fmul", ++ [V3D_QPU_M_FTOUNORM16] = "ftounorm16", ++ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", ++ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", ++ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", ++ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo", ++ 
[V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -463,6 +473,10 @@ static const uint8_t add_op_args[] = { + + [V3D_QPU_A_MOV] = D | A, + [V3D_QPU_A_FMOV] = D | A, ++ [V3D_QPU_A_VPACK] = D | A | B, ++ [V3D_QPU_A_V8PACK] = D | A | B, ++ [V3D_QPU_A_V10PACK] = D | A | B, ++ [V3D_QPU_A_V11FPACK] = D | A | B, + }; + + static const uint8_t mul_op_args[] = { +@@ -476,6 +490,12 @@ static const uint8_t mul_op_args[] = { + [V3D_QPU_M_NOP] = 0, + [V3D_QPU_M_MOV] = D | A, + [V3D_QPU_M_FMUL] = D | A | B, ++ [V3D_QPU_M_FTOUNORM16] = D | A, ++ [V3D_QPU_M_FTOSNORM16] = D | A, ++ [V3D_QPU_M_VFTOUNORM8] = D | A, ++ [V3D_QPU_M_VFTOSNORM8] = D | A, ++ [V3D_QPU_M_VFTOUNORM10LO] = D | A, ++ [V3D_QPU_M_VFTOUNORM10HI] = D | A, + }; + + bool +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index d408fb426fa..56eee9f9cac 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -231,6 +231,10 @@ enum v3d_qpu_add_op { + /* V3D 7.x */ + V3D_QPU_A_FMOV, + V3D_QPU_A_MOV, ++ V3D_QPU_A_VPACK, ++ V3D_QPU_A_V8PACK, ++ V3D_QPU_A_V10PACK, ++ V3D_QPU_A_V11FPACK, + }; + + enum v3d_qpu_mul_op { +@@ -244,6 +248,14 @@ enum v3d_qpu_mul_op { + V3D_QPU_M_MOV, + V3D_QPU_M_NOP, + V3D_QPU_M_FMUL, ++ ++ /* V3D 7.x */ ++ V3D_QPU_M_FTOUNORM16, ++ V3D_QPU_M_FTOSNORM16, ++ V3D_QPU_M_VFTOUNORM8, ++ V3D_QPU_M_VFTOSNORM8, ++ V3D_QPU_M_VFTOUNORM10LO, ++ V3D_QPU_M_VFTOUNORM10HI, + }; + + enum v3d_qpu_output_pack { +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index 7984712d527..6cd75adac6d 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -783,6 +783,9 @@ static const struct opcode_desc add_ops_v71[] = { + { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, + ++ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, ++ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, ++ + { 
249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, +@@ -797,6 +800,8 @@ static const struct opcode_desc add_ops_v71[] = { + { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, + ++ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, ++ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, + }; + + static const struct opcode_desc mul_ops_v71[] = { +@@ -822,6 +827,13 @@ static const struct opcode_desc mul_ops_v71[] = { + { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, + ++ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, ++ + { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, + + { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch b/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch new file mode 100644 index 0000000000..ee65de9a53 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch @@ -0,0 +1,221 @@ +From 4f33de7771621e15aae3e3c60c09fd5a2f29bdac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 30 Nov 2021 02:39:20 +0100 +Subject: [PATCH 066/142] nir: add new opcodes to map new v71 + packing/conversion instructions + +Since 
v71, broadcom hw includes specific packing/conversion +instructions, so this commit adds opcodes to be able to make use of +them, especially for image stores: + + * vftounorm8/vftosnorm8: 2x16-bit floating point to 2x8-bit + unorm/snorm + * ftounorm16/ftosnorm16: floating point to 16-bit unorm/snorm + * vftounorm10lo/vftounorm10hi: used to convert a floating point to + a r10g10b10a2 unorm + + * v11fpack: packs 2 2x16 FP into R11G11B10. + * v10pack: pack 2 2x16 integer into R10G10B10A2 + * v8pack: packs 2 2x16 bit integer into 4x8 bits. + * vpack: 2x32 bit to 2x16 integer pack + +For the latter, it can be easily confused with the existing and general +pack_32_2x16_split. But note that this one receives two 16bit integers, +and packs them on a 32bit integer. But broadcom opcode takes two 32bit +integers, takes the lower halfword, and packs them as 2x16 on a 32bit +integer. + +Interestingly broadcom also defines a similar one that packs the +higher halfword. Not used yet. + +FIXME: vftounorm10lo/hi constant expression implementation is somewhat +convoluted. It is likely that it could be implemented in an easier +way. But it works (passing the tests added with CTS issue #3372, +created with this change in mind). 
+--- + src/compiler/nir/nir_constant_expressions.py | 106 +++++++++++++++++++ + src/compiler/nir/nir_opcodes.py | 44 ++++++++ + 2 files changed, 150 insertions(+) + +diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py +index e6383b67737..46395d79a89 100644 +--- a/src/compiler/nir/nir_constant_expressions.py ++++ b/src/compiler/nir/nir_constant_expressions.py +@@ -62,6 +62,8 @@ template = """\ + #include "util/softfloat.h" + #include "util/bigmath.h" + #include "util/format/format_utils.h" ++#include "util/format_r11g11b10f.h" ++#include "util/u_math.h" + #include "nir_constant_expressions.h" + + /** +@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u) + return _mesa_half_to_float(u); + } + ++/* Broadcom v3d specific instructions */ ++/** ++ * Packs 2 2x16 floating split into a r11g11b10f ++ */ ++static uint32_t v11fpack_v3d(const uint32_t src0, ++ const uint32_t src1) ++{ ++ float rgb[3]; ++ ++ rgb[0] = unpack_half_1x16((src0 & 0xffff)); ++ rgb[1] = unpack_half_1x16((src0 >> 16)); ++ rgb[2] = unpack_half_1x16((src1 & 0xffff)); ++ ++ return float3_to_r11g11b10f(rgb); ++} ++ ++/** ++ * The three methods below are basically wrappers over pack_s/unorm_1x8/1x16, ++ * as it receives a uint16_t val instead of a float ++ */ ++static uint8_t _mesa_half_to_snorm8(uint16_t val) ++{ ++ float x = _mesa_half_to_float(val); ++ ++ return pack_snorm_1x8(x); ++} ++ ++static uint16_t _mesa_float_to_snorm16(uint32_t val) ++{ ++ union fi aux; ++ aux.ui = val; ++ return pack_snorm_1x16(aux.f); ++} ++ ++static uint16_t _mesa_float_to_unorm16(uint32_t val) ++{ ++ union fi aux; ++ aux.ui = val; ++ return pack_unorm_1x16(aux.f); ++} ++ ++/* FIXME: the implementation below of vftounorm10hi/lo is somewhat too ++ * verbose. It is likely that there would be a simpler way to implement ++ * it. 
++ */ ++static uint32_t float_pack16_v3d(uint32_t f32) ++{ ++ float f = uif(f32); ++ return _mesa_float_to_half(f); ++} ++ ++static uint32_t float_unpack16_v3d(uint32_t f16) ++{ ++ float f = _mesa_half_to_float(f16); ++ return fui(f); ++} ++ ++static uint32_t vfpack_v3d(uint32_t a, uint32_t b) ++{ ++ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a); ++} ++ ++static uint32_t vfsat_v3d(uint32_t a) ++{ ++ return vfpack_v3d( ++ fui(SATURATE(_mesa_half_to_float(a & 0xffff))), ++ fui(SATURATE(_mesa_half_to_float(a >> 16)))); ++} ++ ++static uint32_t fmul_v3d(uint32_t a, uint32_t b) ++{ ++ float f = uif(a); ++ float g = uif(b); ++ ++ float x = f * g; ++ ++ return fui(x); ++} ++ ++#define L(x) float_unpack16_v3d((x) & 0xffff) ++#define H(x) float_unpack16_v3d((x) >> 16) ++#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b))) ++ ++static uint32_t vfmul_v3d(uint32_t a, uint32_t b) ++{ ++ return V(fmul_v3d, a, b); ++} ++ ++/* Convert 2x16-bit floating point to 2x10-bit unorm */ ++static uint32_t vftounorm10lo(uint32_t src0) ++{ ++ return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff); ++} ++ ++/* ++ * Convert 2x16-bit floating point to one 2-bit and one ++ * 10-bit unorm ++ */ ++static uint32_t vftounorm10hi(uint32_t src0) ++{ ++ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff); ++} ++ ++ + /* Some typed vector structures to make things like src0.y work */ + typedef int8_t int1_t; + typedef uint8_t uint1_t; +diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py +index e4d87aa6126..63aa7cfa315 100644 +--- a/src/compiler/nir/nir_opcodes.py ++++ b/src/compiler/nir/nir_opcodes.py +@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) { + } + """) + ++# v3d-specific opcodes ++ ++# v3d-specific (v71) instruction that packs parts of 2 2x16 floating point into ++# r11g11b10 bits, rounding to nearest even ++binop_convert("v11fpack_v3d", tuint32, tuint32, "", ++ "v11fpack_v3d(src0, src1)") ++ ++# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit 
integer. The ++# difference with pack_32_2x16_split is that the sources are 32bit too. So it ++# receives 2 32-bit integer, and pack the lower halfword as 2x16 on a 32-bit ++# pack. ++binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32, ++ "(src0.x & 0xffff) | (src1.x << 16)") ++ ++# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2 ++binop_convert("v10pack_v3d", tuint32, tuint32, "", ++ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30") ++ ++# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits: ++# dst[7:0] = src0[7:0] ++# dst[15:8] = src0[23:16] ++# dst[23:16] = src1[7:0] ++# dst[31:24] = src1[23:16] ++opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32], ++ False, "", ++ "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8") ++ ++# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm ++unop("vftounorm8_v3d", tuint32, ++ "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)") ++unop("vftosnorm8_v3d", tuint32, ++ "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)") ++ ++# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm ++unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)") ++unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)") ++ ++# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm ++unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)") ++ ++# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit ++# and one 10 bit unorm ++unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)") ++ + # Mali-specific opcodes + unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)")) + unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)")) +-- +2.39.2 + diff --git 
a/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch b/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch new file mode 100644 index 0000000000..911dd462a8 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch @@ -0,0 +1,452 @@ +From 381c29e3ff5237c89380cc53eb2271d1985f4e34 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 2 Dec 2021 13:26:43 +0100 +Subject: [PATCH 067/142] broadcom/compiler: update image store lowering to use + v71 new packing/conversion instructions + +Vulkan shaderdb stats with pattern dEQP-VK.image.*.with_format.*.*: + total instructions in shared programs: 35993 -> 33245 (-7.63%) + instructions in affected programs: 21153 -> 18405 (-12.99%) + helped: 394 + HURT: 1 + Instructions are helped. + + total uniforms in shared programs: 8550 -> 7418 (-13.24%) + uniforms in affected programs: 5136 -> 4004 (-22.04%) + helped: 399 + HURT: 0 + Uniforms are helped. + + total max-temps in shared programs: 6014 -> 5905 (-1.81%) + max-temps in affected programs: 473 -> 364 (-23.04%) + helped: 58 + HURT: 0 + Max-temps are helped. + + total nops in shared programs: 1515 -> 1504 (-0.73%) + nops in affected programs: 46 -> 35 (-23.91%) + helped: 14 + HURT: 2 + Inconclusive result (%-change mean confidence interval includes 0). + +FWIW, that one HURT on the instructions count is for just one +instruction. 
+--- + src/broadcom/compiler/nir_to_vir.c | 39 +++ + src/broadcom/compiler/v3d_compiler.h | 16 +- + .../compiler/v3d_nir_lower_image_load_store.c | 246 +++++++++++++++++- + src/broadcom/compiler/vir.c | 2 +- + 4 files changed, 294 insertions(+), 9 deletions(-) + +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index 90fe1d1e7f0..a8cf02dd386 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -1689,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) + result = vir_VFPACK(c, src[0], src[1]); + break; + ++ case nir_op_vpack_v3d: ++ result = vir_VPACK(c, src[0], src[1]); ++ break; ++ ++ case nir_op_v11fpack_v3d: ++ result = vir_V11FPACK(c, src[0], src[1]); ++ break; ++ ++ case nir_op_v10pack_v3d: ++ result = vir_V10PACK(c, src[0], src[1]); ++ break; ++ ++ case nir_op_v8pack_v3d: ++ result = vir_V8PACK(c, src[0], src[1]); ++ break; ++ + case nir_op_unpack_half_2x16_split_x: + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); +@@ -1719,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) + result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + break; + } ++ case nir_op_vftounorm8_v3d: ++ result = vir_VFTOUNORM8(c, src[0]); ++ break; ++ ++ case nir_op_vftosnorm8_v3d: ++ result = vir_VFTOSNORM8(c, src[0]); ++ break; ++ ++ case nir_op_vftounorm10lo_v3d: ++ result = vir_VFTOUNORM10LO(c, src[0]); ++ break; ++ ++ case nir_op_vftounorm10hi_v3d: ++ result = vir_VFTOUNORM10HI(c, src[0]); ++ break; ++ ++ case nir_op_ftounorm16_v3d: ++ result = vir_FTOUNORM16(c, src[0]); ++ break; ++ ++ case nir_op_ftosnorm16_v3d: ++ result = vir_FTOSNORM16(c, src[0]); ++ break; + + default: + fprintf(stderr, "unknown NIR ALU inst: "); +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index 36adf8830b5..425ab0cdf9d 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ 
b/src/broadcom/compiler/v3d_compiler.h +@@ -1186,7 +1186,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader); + bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); + bool v3d_nir_lower_scratch(nir_shader *s); + bool v3d_nir_lower_txf_ms(nir_shader *s); +-bool v3d_nir_lower_image_load_store(nir_shader *s); ++bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); + bool v3d_nir_lower_load_store_bitsize(nir_shader *s); + + void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); +@@ -1427,6 +1427,20 @@ VIR_SFU(LOG) + VIR_SFU(SIN) + VIR_SFU(RSQRT2) + ++VIR_A_ALU2(VPACK) ++VIR_A_ALU2(V8PACK) ++VIR_A_ALU2(V10PACK) ++VIR_A_ALU2(V11FPACK) ++ ++VIR_M_ALU1(FTOUNORM16) ++VIR_M_ALU1(FTOSNORM16) ++ ++VIR_M_ALU1(VFTOUNORM8) ++VIR_M_ALU1(VFTOSNORM8) ++ ++VIR_M_ALU1(VFTOUNORM10LO) ++VIR_M_ALU1(VFTOUNORM10HI) ++ + static inline struct qinst * + vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, + struct qreg dest, struct qreg src) +diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +index 2900a29817f..bbb55be4a14 100644 +--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c ++++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +@@ -40,6 +40,10 @@ + * calculations and load/store using the TMU general memory access path. + */ + ++static const unsigned bits_8[4] = {8, 8, 8, 8}; ++static const unsigned bits_16[4] = {16, 16, 16, 16}; ++static const unsigned bits_1010102[4] = {10, 10, 10, 2}; ++ + bool + v3d_gl_format_is_return_32(enum pipe_format format) + { +@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format) + + /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a + * 32-bit SSA value, with as many channels as necessary to store all the bits ++ * ++ * This is the generic helper, using all common nir operations. 
+ */ + static nir_ssa_def * + pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, +@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, + return nir_vec(b, results, DIV_ROUND_UP(offset, 32)); + } + ++/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is ++ * just easier to read vfpack on the code, specially while using the PRM as ++ * reference ++ */ ++static nir_ssa_def * ++nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2) ++{ ++ return nir_pack_half_2x16_split(b, p1, p2); ++} ++ ++static inline nir_ssa_def * ++pack_11f11f10f(nir_builder *b, nir_ssa_def *color) ++{ ++ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), ++ nir_channel(b, color, 1)); ++ /* FIXME: we noted that we could just use p2 again as the second ++ * element to pack, and CTS tests still works. Just using undef as is ++ * slightly more correct ++ */ ++ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size); ++ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef); ++ ++ return nir_v11fpack_v3d(b, p1, p2); ++} ++ ++static inline nir_ssa_def * ++pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color) ++{ ++ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0), ++ nir_channel(b, color, 1)); ++ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2), ++ nir_channel(b, color, 3)); ++ ++ return nir_v10pack_v3d(b, p1, p2); ++} ++ ++static inline nir_ssa_def * ++pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color) ++{ ++ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), ++ nir_channel(b, color, 1)); ++ p1 = nir_vftounorm10lo_v3d(b, p1); ++ ++ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), ++ nir_channel(b, color, 3)); ++ p2 = nir_vftounorm10hi_v3d(b, p2); ++ ++ return nir_v10pack_v3d(b, p1, p2); ++} ++ ++enum hw_conversion { ++ NONE, ++ TO_SNORM, ++ TO_UNORM ++}; ++ ++static inline nir_ssa_def * ++pack_8bit(nir_builder *b, nir_ssa_def *color, ++ unsigned 
num_components, ++ enum hw_conversion conversion) ++{ ++ /* Note that usually you should not use this method (that relies on ++ * custom packing) for 1 component if we are not doing any ++ * conversion. But we support also that case, and let the caller ++ * decide which method to use. ++ */ ++ nir_ssa_def *p1; ++ nir_ssa_def *p2; ++ ++ if (conversion == NONE) { ++ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0), ++ nir_channel(b, color, num_components == 1 ? 0 : 1)); ++ } else { ++ p1 = nir_vfpack(b, nir_channel(b, color, 0), ++ nir_channel(b, color, num_components == 1 ? 0 : 1)); ++ p1 = (conversion == TO_UNORM) ? ++ nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1); ++ } ++ if (num_components == 4) { ++ if (conversion == NONE) { ++ p2 = nir_vpack_v3d(b, nir_channel(b, color, 2), ++ nir_channel(b, color, 3)); ++ } else { ++ p2 = nir_vfpack(b, nir_channel(b, color, 2), ++ nir_channel(b, color, 3)); ++ p2 = (conversion == TO_UNORM) ? ++ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2); ++ } ++ } else { ++ /* As mentioned on the comment before, using an undef here ++ * would be more correct. But for this case we are getting ++ * worse values, and in fact even some worse instruction count ++ * with some CTS tests, so we just reuse the first packing ++ */ ++ p2 = p1; ++ } ++ ++ return nir_v8pack_v3d(b, p1, p2); ++} ++ ++static inline nir_ssa_def * ++pack_16bit(nir_builder *b, nir_ssa_def *color, ++ unsigned num_components, ++ enum hw_conversion conversion) ++{ ++ nir_ssa_def *results[2]; ++ nir_ssa_def *channels[4]; ++ ++ /* Note that usually you should not use this method (that relies on ++ * custom packing) if we are not doing any conversion. But we support ++ * also that case, and let the caller decide which method to use. 
++ */ ++ ++ for (unsigned i = 0; i < num_components; i++) { ++ channels[i] = nir_channel(b, color, i); ++ switch (conversion) { ++ case TO_SNORM: ++ channels[i] = nir_ftosnorm16_v3d(b, channels[i]); ++ break; ++ case TO_UNORM: ++ channels[i] = nir_ftounorm16_v3d(b, channels[i]); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ switch (num_components) { ++ case 1: ++ results[0] = channels[0]; ++ break; ++ case 4: ++ results[1] = nir_vpack_v3d(b, channels[2], channels[3]); ++ FALLTHROUGH; ++ case 2: ++ results[0] = nir_vpack_v3d(b, channels[0], channels[1]); ++ break; ++ } ++ ++ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2)); ++} ++ ++static inline nir_ssa_def * ++pack_xbit(nir_builder *b, nir_ssa_def *color, ++ unsigned num_components, ++ const struct util_format_channel_description *r_chan) ++{ ++ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED); ++ enum hw_conversion conversion = NONE; ++ if (r_chan->normalized) { ++ conversion = ++ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM; ++ } ++ ++ switch (r_chan->size) { ++ case 8: ++ if (conversion == NONE && num_components < 2) ++ return pack_bits(b, color, bits_8, num_components, pack_mask); ++ else ++ return pack_8bit(b, color, num_components, conversion); ++ break; ++ case 16: ++ /* pack_mask implies that the generic packing method would ++ * need to include extra operations to handle negative values, ++ * so in that case, even without a conversion, it is better to ++ * use the packing using custom hw operations. 
++ */ ++ if (conversion == NONE && !pack_mask) ++ return pack_bits(b, color, bits_16, num_components, pack_mask); ++ else ++ return pack_16bit(b, color, num_components, conversion); ++ break; ++ default: ++ unreachable("unrecognized bits"); ++ } ++} ++ + static bool +-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) ++v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) + { + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); +@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) + */ + formatted = color; + } else { +- static const unsigned bits_8[4] = {8, 8, 8, 8}; +- static const unsigned bits_16[4] = {16, 16, 16, 16}; +- static const unsigned bits_1010102[4] = {10, 10, 10, 2}; + const unsigned *bits; + + switch (r_chan->size) { +@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) + return true; + } + ++ ++static bool ++v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) ++{ ++ enum pipe_format format = nir_intrinsic_format(instr); ++ assert(format != PIPE_FORMAT_NONE); ++ const struct util_format_description *desc = ++ util_format_description(format); ++ const struct util_format_channel_description *r_chan = &desc->channel[0]; ++ unsigned num_components = util_format_get_nr_components(format); ++ b->cursor = nir_before_instr(&instr->instr); ++ ++ nir_ssa_def *color = nir_channels(b, ++ nir_ssa_for_src(b, instr->src[3], 4), ++ (1 << num_components) - 1); ++ nir_ssa_def *formatted = NULL; ++ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { ++ formatted = nir_format_pack_r9g9b9e5(b, color); ++ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { ++ formatted = pack_11f11f10f(b, color); ++ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { ++ formatted = pack_r10g10b10a2_uint(b, color); ++ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { ++ formatted = pack_r10g10b10a2_unorm(b, color); 
++ } else if (r_chan->size == 32) { ++ /* For 32-bit formats, we just have to move the vector ++ * across (possibly reducing the number of channels). ++ */ ++ formatted = color; ++ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { ++ assert(r_chan->size == 16); ++ formatted = nir_format_float_to_half(b, color); ++ formatted = pack_bits(b, formatted, bits_16, num_components, ++ false); ++ } else { ++ assert(r_chan->size == 8 || r_chan->size == 16); ++ formatted = pack_xbit(b, color, num_components, r_chan); ++ } ++ ++ nir_instr_rewrite_src(&instr->instr, &instr->src[3], ++ nir_src_for_ssa(formatted)); ++ instr->num_components = formatted->num_components; ++ ++ return true; ++} ++ + static bool + v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) + { +@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b, + nir_intrinsic_instr *intr = + nir_instr_as_intrinsic(instr); + ++ struct v3d_compile *c = (struct v3d_compile *) _state; ++ + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + return v3d_nir_lower_image_load(b, intr); + case nir_intrinsic_image_store: +- return v3d_nir_lower_image_store(b, intr); ++ if (c->devinfo->ver >= 71) ++ return v3d_nir_lower_image_store_v71(b, intr); ++ else ++ return v3d_nir_lower_image_store_v42(b, intr); ++ break; + default: + return false; + } +@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b, + } + + bool +-v3d_nir_lower_image_load_store(nir_shader *s) ++v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) + { + return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb, + nir_metadata_block_index | +- nir_metadata_dominance, NULL); ++ nir_metadata_dominance, c); + } +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index aea113f050e..7612eed7130 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -1576,7 +1576,7 @@ v3d_attempt_compile(struct v3d_compile *c) + + NIR_PASS(_, c->s, 
v3d_nir_lower_io, c); + NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); +- NIR_PASS(_, c->s, v3d_nir_lower_image_load_store); ++ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); + + NIR_PASS(_, c->s, nir_opt_idiv_const, 8); + nir_lower_idiv_options idiv_options = { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch new file mode 100644 index 0000000000..1fe43abf8f --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch @@ -0,0 +1,68 @@ +From f6082e941a3454c8735df2ff2713ae49b3daa74f Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 18 Apr 2023 08:50:13 +0200 +Subject: [PATCH 068/142] broadcom/compiler: don't allocate spill base to rf0 + in V3D 7.x + +Otherwise it can be stomped by instructions doing implicit rf0 writes. +--- + src/broadcom/compiler/vir_register_allocate.c | 21 +++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 440b093a636..121c9b2794f 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -582,7 +582,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) + } + + static void +-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) ++v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, ++ int spill_temp) + { + c->spill_start_num_temps = c->num_temps; + c->spilling = true; +@@ -594,8 +595,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + spill_offset = c->spill_size; + c->spill_size += V3D_CHANNELS * sizeof(uint32_t); + +- if (spill_offset == 0) ++ if (spill_offset == 0) { + v3d_setup_spill_base(c); ++ ++ /* 
Don't allocate our spill base to rf0 to avoid ++ * conflicts with instructions doing implicit writes ++ * to that register. ++ */ ++ if (!c->devinfo->has_accumulators) { ++ ra_add_node_interference( ++ c->g, ++ temp_to_node(c, c->spill_base.index), ++ implicit_rf_nodes[0]); ++ } ++ } + } + + struct qinst *last_thrsw = c->last_thrsw; +@@ -1346,7 +1359,7 @@ v3d_register_allocate(struct v3d_compile *c) + int node = v3d_choose_spill_node(c); + uint32_t temp = node_to_temp(c, node); + if (node != -1) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } + } +@@ -1363,7 +1376,7 @@ v3d_register_allocate(struct v3d_compile *c) + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch b/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch new file mode 100644 index 0000000000..fb73352b1a --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch @@ -0,0 +1,186 @@ +From 0e9577fbb18a026390f653ca22f5a98a69a5fe59 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 2 May 2023 10:12:37 +0200 +Subject: [PATCH 069/142] broadcom/compiler: improve allocation for final + program instructions + +The last 3 instructions can't use specific registers so flag all the +nodes for temps used in the last program instructions and try to +avoid assigning any of these. This may help us avoid injecting nops +for the last thread switch instruction. 
+ +Because register allocation needs to happen before QPU scheduling +and instruction merging we can't tell exactly what the last 3 +instructions will be, so we do this for a few more instructions than +just 3. + +We only do this for fragment shaders because other shader stages +always end with VPM store instructions that take a small immediate +and therefore will never allow us to merge the final thread switch +earlier, so limiting allocation for these shaders will never improve +anything and might instead be detrimental. + +total instructions in shared programs: 11471389 -> 11464335 (-0.06%) +instructions in affected programs: 582908 -> 575854 (-1.21%) +helped: 4669 +HURT: 578 +Instructions are helped. + +total max-temps in shared programs: 2230497 -> 2230150 (-0.02%) +max-temps in affected programs: 5662 -> 5315 (-6.13%) +helped: 344 +HURT: 44 +Max-temps are helped. + +total sfu-stalls in shared programs: 18068 -> 18077 (0.05%) +sfu-stalls in affected programs: 264 -> 273 (3.41%) +helped: 37 +HURT: 48 +Inconclusive result (value mean confidence interval includes 0). + +total inst-and-stalls in shared programs: 11489457 -> 11482412 (-0.06%) +inst-and-stalls in affected programs: 585180 -> 578135 (-1.20%) +helped: 4659 +HURT: 588 +Inst-and-stalls are helped. + +total nops in shared programs: 301738 -> 298140 (-1.19%) +nops in affected programs: 14680 -> 11082 (-24.51%) +helped: 3252 +HURT: 108 +Nops are helped. 
+--- + src/broadcom/compiler/v3d_compiler.h | 1 + + src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++++-- + 2 files changed, 66 insertions(+), 4 deletions(-) + +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index 425ab0cdf9d..2642d23b629 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -613,6 +613,7 @@ struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; ++ bool is_program_end; + + /* V3D 7.x */ + bool is_ldunif_dst; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 121c9b2794f..495644bb557 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -385,6 +385,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; + c->nodes.info[node].is_ldunif_dst = false; ++ c->nodes.info[node].is_program_end = false; + } + + /* The spill offset for this thread takes a bit of setup, so do it once at +@@ -929,6 +930,17 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + return true; + } + ++ /* The last 3 instructions in a shader can't use some specific registers ++ * (usually early rf registers, depends on v3d version) so try to ++ * avoid allocating these to registers used by the last instructions ++ * in the shader. ++ */ ++ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 
3 : 4; ++ if (v3d_ra->nodes->info[node].is_program_end && ++ v3d_ra->next_phys < safe_rf_start) { ++ v3d_ra->next_phys = safe_rf_start; ++ } ++ + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; + int phys = v3d_ra->phys_index + phys_off; +@@ -1218,6 +1230,44 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + } + } + ++static void ++flag_program_end_nodes(struct v3d_compile *c) ++{ ++ /* Only look for registers used in this many instructions */ ++ uint32_t last_set_count = 6; ++ ++ struct qblock *last_block = vir_exit_block(c); ++ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { ++ if (!inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) ++ continue; ++ ++ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ } ++ ++ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ ++ } ++ } ++ ++ if (inst->dst.file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->dst.index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ ++ if (--last_set_count == 0) ++ break; ++ } ++} ++ + /** + * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 
+ * +@@ -1280,17 +1330,16 @@ v3d_register_allocate(struct v3d_compile *c) + */ + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; ++ c->nodes.info[i].is_program_end = false; ++ c->nodes.info[i].priority = 0; ++ c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); +- c->nodes.info[i].priority = 0; +- c->nodes.info[i].class_bits = 0; + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); +- c->nodes.info[i].priority = 0; +- c->nodes.info[i].class_bits = 0; + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = +@@ -1327,6 +1376,18 @@ v3d_register_allocate(struct v3d_compile *c) + last_ldvary_ip, inst); + } + ++ /* Flag the nodes that are used in the last instructions of the program ++ * (there are some registers that cannot be used in the last 3 ++ * instructions). We only do this for fragment shaders, because the idea ++ * is that by avoiding this conflict we may be able to emit the last ++ * thread switch earlier in some cases, however, in non-fragment shaders ++ * this won't happen because the last instructions are always VPM stores ++ * with a small immediate, which conflicts with other signals, ++ * preventing us from ever moving the thrsw earlier. 
++ */ ++ if (c->s->info.stage == MESA_SHADER_FRAGMENT) ++ flag_program_end_nodes(c); ++ + /* Set the register classes for all our temporaries in the graph */ + for (uint32_t i = 0; i < c->num_temps; i++) { + ra_set_node_class(c->g, temp_to_node(c, i), +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch b/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch new file mode 100644 index 0000000000..1b29439b82 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch @@ -0,0 +1,105 @@ +From 645fe451bcecbe3345a144222306d06fb39f6b9f Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 2 May 2023 10:17:47 +0200 +Subject: [PATCH 070/142] broadcom/compiler: don't assign registers to unused + nodes/temps + +In programs with a lot of unused temps, if we don't do this, we may +end up recycling previously used rfs more often, which can be +detrimental to instruction pairing. + +total instructions in shared programs: 11464335 -> 11444136 (-0.18%) +instructions in affected programs: 8976743 -> 8956544 (-0.23%) +helped: 33196 +HURT: 33778 +Inconclusive result + +total max-temps in shared programs: 2230150 -> 2229445 (-0.03%) +max-temps in affected programs: 86413 -> 85708 (-0.82%) +helped: 2217 +HURT: 1523 +Max-temps are helped. + +total sfu-stalls in shared programs: 18077 -> 17104 (-5.38%) +sfu-stalls in affected programs: 8669 -> 7696 (-11.22%) +helped: 2657 +HURT: 2182 +Sfu-stalls are helped. 
+ +total inst-and-stalls in shared programs: 11482412 -> 11461240 (-0.18%) +inst-and-stalls in affected programs: 8995697 -> 8974525 (-0.24%) +helped: 33319 +HURT: 33708 +Inconclusive result + +total nops in shared programs: 298140 -> 296185 (-0.66%) +nops in affected programs: 52805 -> 50850 (-3.70%) +helped: 3797 +HURT: 2662 +Inconclusive result +--- + src/broadcom/compiler/v3d_compiler.h | 1 + + src/broadcom/compiler/vir_register_allocate.c | 14 ++++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index 2642d23b629..f1a807e38fd 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -614,6 +614,7 @@ struct v3d_ra_node_info { + uint32_t priority; + uint8_t class_bits; + bool is_program_end; ++ bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 495644bb557..0ab0474424f 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -386,6 +386,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + c->nodes.info[node].priority = 0; + c->nodes.info[node].is_ldunif_dst = false; + c->nodes.info[node].is_program_end = false; ++ c->nodes.info[node].unused = false; + } + + /* The spill offset for this thread takes a bit of setup, so do it once at +@@ -918,6 +919,12 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) + { ++ /* If this node is for an unused temp, ignore. 
*/ ++ if (v3d_ra->nodes->info[node].unused) { ++ *out = 0; ++ return true; ++ } ++ + /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst + * so we can avoid turning them into ldunifrf (which uses the + * cond field to encode the dst and would prevent merge with +@@ -1331,6 +1338,7 @@ v3d_register_allocate(struct v3d_compile *c) + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; ++ c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { +@@ -1396,6 +1404,12 @@ v3d_register_allocate(struct v3d_compile *c) + + /* Add register interferences based on liveness data */ + for (uint32_t i = 0; i < c->num_temps; i++) { ++ /* And while we are here, let's also flag nodes for ++ * unused temps. ++ */ ++ if (c->temp_start[i] > c->temp_end[i]) ++ c->nodes.info[temp_to_node(c, i)].unused = true; ++ + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch b/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch new file mode 100644 index 0000000000..1ff6366faa --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch @@ -0,0 +1,83 @@ +From 851704169d59e28c5429b06d05e5ef952be893a2 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 15 May 2023 10:02:10 +0200 +Subject: [PATCH 071/142] broadcom/compiler: only assign rf0 as last resort in + V3D 7.x + +So we can use it for ldunif(a) and avoid generating ldunif(a)rf which +can't be paired with conditional instructions. 
+ +shader-db (pi5): + +total instructions in shared programs: 11357802 -> 11338883 (-0.17%) +instructions in affected programs: 7117889 -> 7098970 (-0.27%) +helped: 24264 +HURT: 17574 +Instructions are helped. + +total uniforms in shared programs: 3857808 -> 3857815 (<.01%) +uniforms in affected programs: 92 -> 99 (7.61%) +helped: 0 +HURT: 1 + +total max-temps in shared programs: 2230904 -> 2230199 (-0.03%) +max-temps in affected programs: 52309 -> 51604 (-1.35%) +helped: 1219 +HURT: 725 +Max-temps are helped. + +total sfu-stalls in shared programs: 15021 -> 15236 (1.43%) +sfu-stalls in affected programs: 6848 -> 7063 (3.14%) +helped: 1866 +HURT: 1704 +Inconclusive result + +total inst-and-stalls in shared programs: 11372823 -> 11354119 (-0.16%) +inst-and-stalls in affected programs: 7149177 -> 7130473 (-0.26%) +helped: 24315 +HURT: 17561 +Inst-and-stalls are helped. + +total nops in shared programs: 273624 -> 273711 (0.03%) +nops in affected programs: 31562 -> 31649 (0.28%) +helped: 1619 +HURT: 1854 +Inconclusive result (value mean confidence interval includes 0). +--- + src/broadcom/compiler/vir_register_allocate.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index 0ab0474424f..8eac2b75bd7 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -950,6 +950,11 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; ++ ++ /* Try to keep rf0 available for ldunif in 7.x (see above). 
*/ ++ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) ++ continue; ++ + int phys = v3d_ra->phys_index + phys_off; + + if (BITSET_TEST(regs, phys)) { +@@ -959,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + } + } + ++ /* If we couldn't allocate, do try to assign rf0 if it is available. */ ++ if (v3d_ra->devinfo->ver >= 71 && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ v3d_ra->next_phys = 1; ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ + return false; + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch b/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch new file mode 100644 index 0000000000..2fcd20415f --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch @@ -0,0 +1,30 @@ +From 0d3fd30d67ffc0195b0783e30ab6afbbe403310a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 28 Apr 2021 14:31:38 +0200 +Subject: [PATCH 072/142] v3dv: recover non-conformant warning for not fully + supported hw + +--- + src/broadcom/vulkan/v3dv_device.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index d5de3517670..d29ffad3531 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -1212,6 +1212,12 @@ create_physical_device(struct v3dv_instance *instance, + + list_addtail(&device->vk.link, &instance->vk.physical_devices.list); + ++ if (device->devinfo.ver != 42) { ++ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither " ++ "a complete nor a conformant Vulkan implementation. 
Testing " ++ "use only.\n", device->devinfo.ver); ++ } ++ + return VK_SUCCESS; + + fail: +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch b/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch new file mode 100644 index 0000000000..8023c45736 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch @@ -0,0 +1,504 @@ +From 52b5ac62b367ae89574c8031fdcf7c1dae05c942 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 29 Jun 2021 11:59:53 +0200 +Subject: [PATCH 073/142] v3dv/meson: add v71 hw generation + +Starting point for v71 version inclusion. + +This just adds it as one of the versions to be compiled (on meson), +updates the v3dX/v3dv_X macros, and update the code enough to get it +compiling when building using the two versions. For any packet not +available on v71 we just provide a generic asserted placeholder of +generation not supported. + +Any real v71 support will be implemented on following commits. 
+--- + src/broadcom/vulkan/meson.build | 6 +- + src/broadcom/vulkan/v3dv_private.h | 7 +++ + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 75 +++++++++++++++++++++++-- + src/broadcom/vulkan/v3dvx_image.c | 16 +++++- + src/broadcom/vulkan/v3dvx_meta_common.c | 32 +++++++++++ + src/broadcom/vulkan/v3dvx_pipeline.c | 5 ++ + src/broadcom/vulkan/v3dvx_queue.c | 11 ++++ + 7 files changed, 142 insertions(+), 10 deletions(-) + +diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build +index ad032d832ad..3da7364686f 100644 +--- a/src/broadcom/vulkan/meson.build ++++ b/src/broadcom/vulkan/meson.build +@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target( + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--beta', with_vulkan_beta.to_string(), + '--device-prefix', 'ver42', ++ '--device-prefix', 'ver71', + ], + depend_files : vk_entrypoints_gen_depend_files, + ) +@@ -67,10 +68,7 @@ files_per_version = files( + 'v3dvx_queue.c', + ) + +-# The vulkan driver only supports version >= 42, which is the version present in +-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d +-# driver. 
+-v3d_versions = ['42'] ++v3d_versions = ['42', '71'] + + v3dv_flags = [] + +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index c6707211529..6bdf338c67b 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -2608,6 +2608,9 @@ u64_compare(const void *key1, const void *key2) + case 42: \ + v3d_X_thing = &v3d42_##thing; \ + break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ +@@ -2626,6 +2629,10 @@ u64_compare(const void *key1, const void *key2) + # define v3dX(x) v3d42_##x + # include "v3dvx_private.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dvx_private.h" ++# undef v3dX + #endif + + #ifdef ANDROID +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index f182b790d36..b958e634c82 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) + }; + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; + cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); +@@ -82,10 +87,15 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = 
MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + + /* There's definitely nothing in the VCD cache we want. */ +@@ -649,10 +659,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + * bit and instead we have to emit a single clear of all tile buffers. + */ + if (use_global_zs_clear || use_global_rt_clear) { ++#if V3D_VERSION == 42 + cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = use_global_zs_clear; + clear.clear_all_render_targets = use_global_rt_clear; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + } + } + +@@ -824,7 +839,12 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + config.number_of_render_targets = MAX2(subpass->color_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_image_view *iview = +@@ -920,7 +940,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const struct v3d_resource_slice *slice = + &image->planes[plane].slices[iview->vk.base_mip_level]; + +- const uint32_t *clear_color = ++ UNUSED const uint32_t *clear_color = + &state->attachments[attachment_idx].clear_value.color[0]; + + uint32_t clear_pad = 0; +@@ -937,13 +957,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, 
clear) { + clear.clear_color_low_32_bits = clear_color[0]; + clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; + clear.render_target_number = i; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((clear_color[1] >> 24) | (clear_color[2] << 8)); +@@ -951,17 +977,28 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); + clear.render_target_number = i; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = clear_color[3] >> 16; + clear.render_target_number = i; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + v3dX(cmd_buffer_render_pass_setup_render_target) + (cmd_buffer, 0, &rt.render_target_0_internal_bpp, +@@ -976,6 +1013,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + (cmd_buffer, 3, &rt.render_target_3_internal_bpp, + &rt.render_target_3_internal_type, &rt.render_target_3_clamp); + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + + /* Ends rendering mode config. 
*/ + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { +@@ -1036,10 +1077,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + if (cmd_buffer->state.tile_aligned_render_area && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +@@ -1065,7 +1111,9 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + * now, would need to change if we allow multiple viewports + */ + float *vptranslate = dynamic->viewport.translate[0]; ++#if V3D_VERSION == 42 + float *vpscale = dynamic->viewport.scale[0]; ++#endif + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); +@@ -1078,10 +1126,15 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); + v3dv_return_if_oom(cmd_buffer, NULL); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + float translate_z, scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0, +@@ -1591,16 +1644,20 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + +- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); +- + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); + v3dv_return_if_oom(cmd_buffer, NULL); + ++#if V3D_VERSION == 42 ++ bool enable_ez = job_update_ez_state(job, 
pipeline, cmd_buffer); + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { + config.early_z_enable = enable_ez; + config.early_z_updates_enable = config.early_z_enable && + pipeline->z_updates_enable; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + + void +@@ -2031,10 +2088,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + pipeline->vpm_cfg.Gv); + } + ++#if V3D_VERSION == 42 + struct v3dv_bo *default_attribute_values = + pipeline->default_attribute_values != NULL ? + pipeline->default_attribute_values : + pipeline->device->default_attribute_float; ++#endif + + cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, + pipeline->shader_state_record, shader) { +@@ -2060,8 +2119,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; + shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; + ++#if V3D_VERSION == 42 + shader.address_of_default_attribute_values = + v3dv_cl_address(default_attribute_values, 0); ++#endif + + shader.any_shader_reads_hardware_written_primitive_id = + (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; +@@ -2399,11 +2460,17 @@ v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buf + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +- *rt_type = iview->planes[0].internal_type; + if (vk_format_is_int(iview->vk.view_format)) ++#if V3D_VERSION == 42 ++ *rt_type = iview->planes[0].internal_type; ++ if (vk_format_is_int(iview->vk.format)) + *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(iview->vk.view_format)) + *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; + else + *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } +diff --git 
a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index 80a3e5bfde8..dac6ff2741f 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); + tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); + +- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; +- + tex.texture_type = image_view->format->planes[plane].tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { +@@ -110,7 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + + tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; + ++#if V3D_VERSION == 42 ++ tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; ++#endif ++ ++#if V3D_VERSION == 42 + tex.srgb = vk_format_is_srgb(image_view->vk.view_format); ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + + /* At this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to +@@ -166,7 +173,12 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; ++#if V3D_VERSION == 42 + tex.srgb = vk_format_is_srgb(buffer_view->vk_format); ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + + /* At this point we don't have the job. 
That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index 04147b82cbd..2db07ea7427 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -58,7 +58,12 @@ emit_rcl_prologue(struct v3dv_job *job, + config.number_of_render_targets = 1; + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + config.internal_depth_type = fb->internal_depth_type; + } + +@@ -88,14 +93,20 @@ emit_rcl_prologue(struct v3dv_job *job, + } + } + ++#if V3D_VERSION == 42 + const uint32_t *color = &clear_info->clear_value->color[0]; + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = color[0]; + clear.clear_color_next_24_bits = color[1] & 0x00ffffff; + clear.render_target_number = 0; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + + if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((color[1] >> 24) | (color[2] << 8)); +@@ -103,22 +114,37 @@ emit_rcl_prologue(struct v3dv_job *job, + ((color[2] >> 24) | ((color[3] & 0xffff) << 8)); + clear.render_target_number = 0; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif ++ + } + + if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = 
color[3] >> 16; + clear.render_target_number = 0; + }; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + } + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = tiling->internal_bpp; + rt.render_target_0_internal_type = fb->internal_type; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; +@@ -179,10 +205,16 @@ emit_frame_setup(struct v3dv_job *job, + */ + if (clear_value && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif ++ + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 5d32d414ed8..922698b08a2 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -447,10 +447,15 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + /* FIXME: Use combined input/output size flag in the common case (also + * on v3d, see v3dx_draw). + */ ++#if V3D_VERSION == 42 + shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs_bin->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs->separate_segments; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->separate_segments ? 
+diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c +index efe63de425c..1a26d04aef7 100644 +--- a/src/broadcom/vulkan/v3dvx_queue.c ++++ b/src/broadcom/vulkan/v3dvx_queue.c +@@ -42,14 +42,25 @@ v3dX(job_emit_noop)(struct v3dv_job *job) + config.image_height_pixels = 1; + config.number_of_render_targets = 1; + config.multisample_mode_4x = false; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; + rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("Hardware generation 71 not supported yet."); ++#endif ++ + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch b/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch new file mode 100644 index 0000000000..3b3626dda1 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch @@ -0,0 +1,29 @@ +From 7aa016bca8bb1bf449ea79505692353c0bd174b8 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 10 Nov 2021 10:06:50 +0100 +Subject: [PATCH 074/142] v3dv: expose V3D revision number in device name + +--- + src/broadcom/vulkan/v3dv_device.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index d29ffad3531..3034b561480 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -1123,8 +1123,10 @@ 
create_physical_device(struct v3dv_instance *instance, + device->next_program_id = 0; + + ASSERTED int len = +- asprintf(&device->name, "V3D %d.%d", +- device->devinfo.ver / 10, device->devinfo.ver % 10); ++ asprintf(&device->name, "V3D %d.%d.%d", ++ device->devinfo.ver / 10, ++ device->devinfo.ver % 10, ++ device->devinfo.rev); + assert(len != -1); + + v3dv_physical_device_init_disk_cache(device); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch b/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch new file mode 100644 index 0000000000..249a11c141 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch @@ -0,0 +1,54 @@ +From fb9e95b7e1d5987fd25e914635c4e09d81ea9561 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 10 Nov 2021 07:54:35 +0100 +Subject: [PATCH 075/142] v3dv/device: handle new rpi5 device (bcm2712) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This includes both master and primary devices. 
+ +Signed-off-by: Iago Toral Quiroga +Signed-off-by: Alejandro Piñeiro +--- + src/broadcom/vulkan/v3dv_device.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index 3034b561480..c8719d33f15 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -1287,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance) + if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { ++ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || ++ strncmp(*compat, "brcm,2712-v3d", 13) == 0) { + v3d_idx = i; + break; + } +@@ -1296,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance) + } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || +- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { ++ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { + vc4_idx = i; + break; + } +@@ -1334,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) + switch (dev->devinfo.ver) { + case 42: + return 0xBE485FD3; /* Broadcom deviceID for 2711 */ ++ case 71: ++ return 0x55701C33; /* Broadcom deviceID for 2712 */ + default: + unreachable("Unsupported V3D version"); + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch new file mode 100644 index 0000000000..70419bad10 --- /dev/null +++ 
b/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch @@ -0,0 +1,32 @@ +From c4f957af4fb0e10abf0a7ffad4f7a468633b7d99 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 20 Jul 2021 14:00:44 +0200 +Subject: [PATCH 076/142] v3dv/cmd_buffer: emit TILE_BINNING_MODE_CFG for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index b958e634c82..17b2f46850d 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -94,7 +94,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideally we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. 
++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); + #endif + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch new file mode 100644 index 0000000000..7a6e9ec2a1 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch @@ -0,0 +1,53 @@ +From 1934ac07df73cb685f6550b8b0f5b4f2ead11396 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 20 Jul 2021 14:33:00 +0200 +Subject: [PATCH 077/142] v3dv: emit TILE_RENDERING_MODE_CFG_COMMON for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++- + src/broadcom/vulkan/v3dvx_meta_common.c | 9 ++++++++- + 2 files changed, 16 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 17b2f46850d..7837b460051 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -850,7 +850,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. 
++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); + #endif + + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index 2db07ea7427..e4084d851fc 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -62,7 +62,14 @@ emit_rcl_prologue(struct v3dv_job *job, + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); + #endif + config.internal_depth_type = fb->internal_depth_type; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch b/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch new file mode 100644 index 0000000000..9c0a0a5ced --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch @@ -0,0 +1,315 @@ +From f0f9eea3cad83ed8824c6a7686150327407a5286 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 22 Jul 2021 14:26:13 +0200 +Subject: [PATCH 078/142] v3dv/cmd_buffer: emit + TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1 for v71 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Alejandro Piñeiro +Signed-off-by: Iago Toral Quiroga +--- + 
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 186 +++++++++++++++++------- + src/broadcom/vulkan/v3dvx_meta_common.c | 12 +- + src/broadcom/vulkan/v3dvx_private.h | 11 +- + 3 files changed, 147 insertions(+), 62 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 7837b460051..c6307890da5 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -800,6 +800,103 @@ set_rcl_early_z_config(struct v3dv_job *job, + } + } + ++/* Note that for v71, render target cfg packets has just one field that ++ * combined the internal type and clamp mode. For simplicity we keep just one ++ * helper. ++ * ++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)". ++ * ++ * FIXME: for v71 we are not returning all the possible combinations for ++ * render target internal type and clamp. For example for int types we are ++ * always using clamp int, and for 16f we are using clamp none or pos (that ++ * seem the equivalent for no-clamp on 4.2), but not pq or hlg. In summary ++ * right now we are just porting what we were doing on 4.2 ++ */ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format) ++{ ++#if V3D_VERSION == 42 ++ if (vk_format_is_int(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_INT; ++ else if (vk_format_is_srgb(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_NORM; ++ else ++ return V3D_RENDER_TARGET_CLAMP_NONE; ++#endif ++#if V3D_VERSION >= 71 ++ switch (rt_type) { ++ case V3D_INTERNAL_TYPE_8I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; ++ case V3D_INTERNAL_TYPE_8UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_8: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ case V3D_INTERNAL_TYPE_16I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; ++ case V3D_INTERNAL_TYPE_16UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_16F: ++ return vk_format_is_srgb(vk_format) ? 
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : ++ V3D_RENDER_TARGET_TYPE_CLAMP_16F; ++ case V3D_INTERNAL_TYPE_32I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; ++ case V3D_INTERNAL_TYPE_32UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_32F: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F; ++ default: ++ unreachable("Unknown internal render target type"); ++ } ++ ++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; ++#endif ++} ++ ++static void ++cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, ++ int rt, ++ uint32_t *rt_bpp, ++#if V3D_VERSION == 42 ++ uint32_t *rt_type, ++ uint32_t *rt_clamp) ++#else ++ uint32_t *rt_type_clamp) ++#endif ++{ ++ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; ++ ++ assert(state->subpass_idx < state->pass->subpass_count); ++ const struct v3dv_subpass *subpass = ++ &state->pass->subpasses[state->subpass_idx]; ++ ++ if (rt >= subpass->color_count) ++ return; ++ ++ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; ++ const uint32_t attachment_idx = attachment->attachment; ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ return; ++ ++ assert(attachment_idx < state->framebuffer->attachment_count && ++ attachment_idx < state->attachment_alloc_count); ++ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; ++ assert(vk_format_is_color(iview->vk.format)); ++ ++ assert(iview->plane_count == 1); ++ *rt_bpp = iview->planes[0].internal_bpp; ++#if V3D_VERSION == 42 ++ *rt_type = iview->planes[0].internal_type; ++ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++#if V3D_VERSION >= 71 ++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -939,10 +1036,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct 
v3dv_cmd_buffer *cmd_buffer) + */ + job->early_zs_clear = do_early_zs_clear; + ++#if V3D_VERSION >= 71 ++ uint32_t base_addr = 0; ++#endif + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) { ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.render_target_number = i; ++ rt.stride = 1; /* Unused */ ++ } ++#endif + continue; ++ } + + struct v3dv_image_view *iview = + state->attachments[attachment_idx].image_view; +@@ -978,9 +1085,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + clear.render_target_number = i; + }; + #endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { + #if V3D_VERSION == 42 +@@ -1010,27 +1114,44 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + unreachable("HW generation 71 not supported yet."); + #endif + } ++ ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = clear_color[0]; ++ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ /* base_addr in multiples of 512 bits. We divide by 8 because stride ++ * is in 128-bit units, but it is packing 2 rows worth of data, so we ++ * need to divide it by 2 so it is only 1 row, and then again by 4 so ++ * it is in 512-bit units. 
++ */ ++ base_addr += (tiling->tile_height * rt.stride) / 8; ++ } ++#endif + } + + #if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 0, &rt.render_target_0_internal_bpp, + &rt.render_target_0_internal_type, &rt.render_target_0_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 1, &rt.render_target_1_internal_bpp, + &rt.render_target_1_internal_type, &rt.render_target_1_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 2, &rt.render_target_2_internal_bpp, + &rt.render_target_2_internal_type, &rt.render_target_2_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 3, &rt.render_target_3_internal_bpp, + &rt.render_target_3_internal_type, &rt.render_target_3_clamp); + } + #endif +-#if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); +-#endif + + /* Ends rendering mode config. 
*/ + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { +@@ -2445,46 +2566,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + buffer->mem_offset + offset); + } + } +- +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp) +-{ +- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; +- +- assert(state->subpass_idx < state->pass->subpass_count); +- const struct v3dv_subpass *subpass = +- &state->pass->subpasses[state->subpass_idx]; +- +- if (rt >= subpass->color_count) +- return; +- +- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; +- const uint32_t attachment_idx = attachment->attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) +- return; +- +- assert(attachment_idx < state->framebuffer->attachment_count && +- attachment_idx < state->attachment_alloc_count); +- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; +- assert(vk_format_is_color(iview->vk.format)); +- +- assert(iview->plane_count == 1); +- *rt_bpp = iview->planes[0].internal_bpp; +- if (vk_format_is_int(iview->vk.view_format)) +-#if V3D_VERSION == 42 +- *rt_type = iview->planes[0].internal_type; +- if (vk_format_is_int(iview->vk.format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; +- else if (vk_format_is_srgb(iview->vk.view_format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; +- else +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif +-} +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index e4084d851fc..c6391bc6d83 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -26,6 +26,7 @@ + + #include "broadcom/common/v3d_macros.h" + #include "broadcom/common/v3d_tfu.h" ++#include 
"broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + #include "broadcom/compiler/v3d_compiler.h" + +@@ -150,7 +151,16 @@ emit_rcl_prologue(struct v3dv_job *job, + } + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.internal_bpp = tiling->internal_bpp; ++ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, ++ fb->vk_format); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = 0; ++ rt.render_target_number = 0; ++ } + #endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index ad8ddfa5731..a4157d11c7c 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -125,13 +125,6 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, + uint32_t internal_size, + uint32_t *hw_color); + +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp); +- + /* Used at v3dv_device */ + + void +@@ -325,3 +318,7 @@ uint32_t v3dX(max_descriptor_bo_size)(void); + uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + + uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch new file mode 100644 index 0000000000..ee9e9d2074 --- /dev/null +++ 
b/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch @@ -0,0 +1,25 @@ +From 7c89d8026fd550282d54933f37ffc2773869326f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Mon, 26 Jul 2021 15:08:11 +0200 +Subject: [PATCH 079/142] v3dvx/cmd_buffer: emit CLEAR_RENDER_TARGETS for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index c6307890da5..ae1c21ae00b 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1219,7 +1219,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); + #endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch new file mode 100644 index 0000000000..a6507e3a17 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch @@ -0,0 +1,38 @@ +From 2eb29b57fde2acda76e12953b3a1050f3056b39d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Sun, 19 Sep 2021 23:37:32 +0200 +Subject: [PATCH 080/142] v3dv/cmd_buffer: emit CLIPPER_XY_SCALING for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index ae1c21ae00b..2e525a11619 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1246,9 +1246,7 @@ 
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + * now, would need to change if we allow multiple viewports + */ + float *vptranslate = dynamic->viewport.translate[0]; +-#if V3D_VERSION == 42 + float *vpscale = dynamic->viewport.scale[0]; +-#endif + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); +@@ -1268,7 +1266,10 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + } + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; ++ } + #endif + + float translate_z, scale_z; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch b/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch new file mode 100644 index 0000000000..cb0d7512d3 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch @@ -0,0 +1,97 @@ +From 611bf6a7445837c7e20416ff9f11a6dad9c543d7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 14 Sep 2021 10:08:19 +0200 +Subject: [PATCH 081/142] v3dv/uniforms: update VIEWPORT_X/Y_SCALE uniforms for + v71 + +As the packet CLIPPER_XY scaling, this needs to be computed on 1/64ths +of pixel, instead of 1/256ths of pixels. + +As this is the usual values that we get from macros, we add manually a +v42 and v71 macro, and define a new helper (V3DV_X) to get the value +for the current hw version. 
+--- + src/broadcom/vulkan/v3dv_private.h | 17 +++++++++++++++++ + src/broadcom/vulkan/v3dv_uniforms.c | 7 ++++--- + src/broadcom/vulkan/v3dvx_private.h | 9 +++++++++ + 3 files changed, 30 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index 6bdf338c67b..cd6811b19c2 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -2617,6 +2617,23 @@ u64_compare(const void *key1, const void *key2) + v3d_X_thing; \ + }) + ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(device, thing) ({ \ ++ __typeof(V3D42_##thing) V3D_X_THING; \ ++ switch (device->devinfo.ver) { \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ ++ + + /* v3d_macros from common requires v3dX and V3DX definitions. Below we need to + * define v3dX for each version supported, because when we compile code that +diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c +index 72fa9a1b39c..0e681cc4ee2 100644 +--- a/src/broadcom/vulkan/v3dv_uniforms.c ++++ b/src/broadcom/vulkan/v3dv_uniforms.c +@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); + + struct v3dv_cl_out *uniforms = cl_start(&job->indirect); +- ++ float clipper_xy_granularity = ++ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + for (int i = 0; i < uinfo->count; i++) { + uint32_t data = uinfo->data[i]; + +@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + break; + + case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); + break; + + case 
QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); + break; + + case QUNIFORM_VIEWPORT_Z_OFFSET: { +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index a4157d11c7c..ff9ba75cf93 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -319,6 +319,15 @@ uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + + uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); + ++/* General utils */ ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); ++ ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ + uint32_t + v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch b/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch new file mode 100644 index 0000000000..8a77ae3708 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch @@ -0,0 +1,40 @@ +From 3819efaf2bb6fd8bd9cd45d54fb7254377b2296a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 27 Jul 2021 14:02:30 +0200 +Subject: [PATCH 082/142] v3dv/cmd_buffer: just don't fill up early-z fields + for CFG_BITS for v71 + +For v71 early_z_enable/early_z_updates_enable is configured with +packet 121. 
+--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 2e525a11619..fe9f7e43596 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1783,17 +1783,14 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); + v3dv_return_if_oom(cmd_buffer, NULL); + +-#if V3D_VERSION == 42 +- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { ++#if V3D_VERSION == 42 ++ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); + config.early_z_enable = enable_ez; + config.early_z_updates_enable = config.early_z_enable && + pipeline->z_updates_enable; +- } +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); + #endif ++ } + } + + void +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch b/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch new file mode 100644 index 0000000000..b37e2be950 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch @@ -0,0 +1,219 @@ +From e3b1a578f45ea830d790970115b6de978d56edb8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 28 Jul 2021 12:01:38 +0200 +Subject: [PATCH 083/142] v3dv: default vertex attribute values are gen + dependant + +Content, structure and size would depend on the generation. Even if it +is needed at all. + +So let's move it to the v3dvx files. 
+--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + src/broadcom/vulkan/v3dv_pipeline.c | 61 ++------------------------- + src/broadcom/vulkan/v3dv_private.h | 4 -- + src/broadcom/vulkan/v3dvx_pipeline.c | 63 ++++++++++++++++++++++++++++ + src/broadcom/vulkan/v3dvx_private.h | 8 ++++ + 5 files changed, 75 insertions(+), 63 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index c8719d33f15..01e2dd7ac2d 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -2043,7 +2043,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, + v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, + device->instance->default_pipeline_cache_enabled); + device->default_attribute_float = +- v3dv_pipeline_create_default_attribute_values(device, NULL); ++ v3dv_X(device, create_default_attribute_values)(device, NULL); + + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index 22f01bdf64b..d012ff8f948 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2802,62 +2802,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, + } + } + +-static bool +-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +-{ +- for (uint8_t i = 0; i < pipeline->va_count; i++) { +- if (vk_format_is_int(pipeline->va[i].vk_format)) +- return true; +- } +- return false; +-} +- +-/* @pipeline can be NULL. We assume in that case that all the attributes have +- * a float format (we only create an all-float BO once and we reuse it with +- * all float pipelines), otherwise we look at the actual type of each +- * attribute used with the specific pipeline passed in. 
+- */ +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline) +-{ +- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; +- struct v3dv_bo *bo; +- +- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); +- +- if (!bo) { +- fprintf(stderr, "failed to allocate memory for the default " +- "attribute values\n"); +- return NULL; +- } +- +- bool ok = v3dv_bo_map(device, bo, size); +- if (!ok) { +- fprintf(stderr, "failed to map default attribute values buffer\n"); +- return false; +- } +- +- uint32_t *attrs = bo->map; +- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; +- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- VkFormat attr_format = +- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; +- if (i < va_count && vk_format_is_int(attr_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); +- } +- } +- +- v3dv_bo_unmap(device, bo); +- +- return bo; +-} +- + static void + pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, + const VkPipelineMultisampleStateCreateInfo *ms_info) +@@ -2992,9 +2936,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, + + v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); + +- if (pipeline_has_integer_vertex_attrib(pipeline)) { ++ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { + pipeline->default_attribute_values = +- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); ++ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); ++ + if (!pipeline->default_attribute_values) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } else { +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index cd6811b19c2..a9fab24d19e 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ 
b/src/broadcom/vulkan/v3dv_private.h +@@ -2500,10 +2500,6 @@ void + v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache); + +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline); +- + VkResult + v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 922698b08a2..e235220cb14 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -664,3 +664,66 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + } + } + } ++ ++static bool ++pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) ++{ ++ for (uint8_t i = 0; i < pipeline->va_count; i++) { ++ if (vk_format_is_int(pipeline->va[i].vk_format)) ++ return true; ++ } ++ return false; ++} ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) ++{ ++ return pipeline_has_integer_vertex_attrib(pipeline); ++} ++ ++/* @pipeline can be NULL. In that case we assume the most common case. For ++ * example, for v42 we assume in that case that all the attributes have a ++ * float format (we only create an all-float BO once and we reuse it with all ++ * float pipelines), otherwise we look at the actual type of each attribute ++ * used with the specific pipeline passed in. 
++ */ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline) ++{ ++ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; ++ struct v3dv_bo *bo; ++ ++ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); ++ ++ if (!bo) { ++ fprintf(stderr, "failed to allocate memory for the default " ++ "attribute values\n"); ++ return NULL; ++ } ++ ++ bool ok = v3dv_bo_map(device, bo, size); ++ if (!ok) { ++ fprintf(stderr, "failed to map default attribute values buffer\n"); ++ return NULL; ++ } ++ ++ uint32_t *attrs = bo->map; ++ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; ++ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ VkFormat attr_format = ++ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; ++ if (i < va_count && vk_format_is_int(attr_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } ++ } ++ ++ v3dv_bo_unmap(device, bo); ++ ++ return bo; ++} +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index ff9ba75cf93..036ce11b455 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -306,6 +306,14 @@ void + v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); ++ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline); ++ + /* Used at v3dv_queue */ + void + v3dX(job_emit_noop)(struct v3dv_job *job); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch 
b/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch new file mode 100644 index 0000000000..f33f20827d --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch @@ -0,0 +1,87 @@ +From 8464dc8869f3d2eccfecac7b4358cc0ffe05f081 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 28 Jul 2021 12:05:26 +0200 +Subject: [PATCH 084/142] v3dv/pipeline: default vertex attributes values are + not needed for v71 + +There are not part of the shader state record. +--- + src/broadcom/vulkan/v3dv_private.h | 10 +++++++++- + src/broadcom/vulkan/v3dvx_pipeline.c | 10 ++++++++++ + 2 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index a9fab24d19e..300a1ec8ae1 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -581,6 +581,10 @@ struct v3dv_device { + * being float being float, allowing us to reuse the same BO for all + * pipelines matching this requirement. Pipelines that need integer + * attributes will create their own BO. ++ * ++ * Note that since v71 the default attribute values are not needed, so this ++ * can be NULL. ++ * + */ + struct v3dv_bo *default_attribute_float; + +@@ -2289,11 +2293,15 @@ struct v3dv_pipeline { + unsigned char sha1[20]; + + /* In general we can reuse v3dv_device->default_attribute_float, so note +- * that the following can be NULL. ++ * that the following can be NULL. In 7.x this is not used, so it will be ++ * NULL. + * + * FIXME: the content of this BO will be small, so it could be improved to + * be uploaded to a common BO. But as in most cases it will be NULL, it is + * not a priority. ++ * ++ * Note that since v71 the default attribute values are not needed, so this ++ * can be NULL. 
+ */ + struct v3dv_bo *default_attribute_values; + +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index e235220cb14..4dc6d70efe1 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -665,6 +665,7 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + } + } + ++#if V3D_VERSION == 42 + static bool + pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) + { +@@ -674,11 +675,16 @@ pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) + } + return false; + } ++#endif + + bool + v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) + { ++#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); ++#endif ++ ++ return false; + } + + /* @pipeline can be NULL. In that case we assume the most common case. For +@@ -691,6 +697,10 @@ struct v3dv_bo * + v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline) + { ++#if V3D_VERSION >= 71 ++ return NULL; ++#endif ++ + uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; + struct v3dv_bo *bo; + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch new file mode 100644 index 0000000000..0d8acd9826 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch @@ -0,0 +1,39 @@ +From 339096598660ec34be8087007dd4d66581de1c4e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 28 Jul 2021 13:45:52 +0200 +Subject: [PATCH 085/142] v3dv/pipeline: handle GL_SHADER_STATE_RECORD changed + size on v71 + +It is likely that we would need more changes, as this packet changed, +but this is enough to get basic tests running. 
Any additional support +will be handled with new commits. +--- + src/broadcom/vulkan/v3dvx_pipeline.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 4dc6d70efe1..a640c1d084a 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -360,7 +360,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, + static void + pack_shader_state_record(struct v3dv_pipeline *pipeline) + { +- assert(sizeof(pipeline->shader_state_record) == ++ assert(sizeof(pipeline->shader_state_record) >= + cl_packet_length(GL_SHADER_STATE_RECORD)); + + struct v3d_fs_prog_data *prog_data_fs = +@@ -453,9 +453,6 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs->separate_segments; + #endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif + + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->separate_segments ? 
+-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch b/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch new file mode 100644 index 0000000000..b1d310f166 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch @@ -0,0 +1,89 @@ +From 5b1342eb1e255d17619b1a7b33eaf7b31f5e50a5 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 22 Sep 2021 12:03:58 +0200 +Subject: [PATCH 086/142] v3dv: setup render pass color clears for any format + bpp in v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 33 ++++++++++++++++---------- + 1 file changed, 20 insertions(+), 13 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index fe9f7e43596..1b39e230580 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1064,7 +1064,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + UNUSED const uint32_t *clear_color = + &state->attachments[attachment_idx].clear_value.color[0]; + +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; +@@ -1084,10 +1084,8 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; + clear.render_target_number = i; + }; +-#endif + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { +-#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((clear_color[1] >> 24) | (clear_color[2] << 8)); +@@ -1095,25 +1093,16 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + 
((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); + clear.render_target_number = i; + }; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif +- + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { +-#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = clear_color[3] >> 16; + clear.render_target_number = i; + }; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif + } ++#endif + + #if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { +@@ -1133,6 +1122,24 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + */ + base_addr += (tiling->tile_height * rt.stride) / 8; + } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) clear_color[1]) | ++ (((uint64_t) (clear_color[2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (clear_color[3])) << 24); ++ rt.render_target_number = i; ++ } ++ } + #endif + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch new file mode 100644 index 0000000000..26e8475540 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch @@ -0,0 +1,126 @@ +From 
ff5b5d4405b1d5600d7f1c4355202fd303f56700 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 22 Sep 2021 12:04:21 +0200 +Subject: [PATCH 087/142] v3dv: setup TLB clear color for meta operations in + v71 + +--- + src/broadcom/vulkan/v3dvx_meta_common.c | 46 +++++++++++++++---------- + 1 file changed, 27 insertions(+), 19 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index c6391bc6d83..09ebcfa97c1 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -75,8 +75,9 @@ emit_rcl_prologue(struct v3dv_job *job, + config.internal_depth_type = fb->internal_depth_type; + } + ++ const uint32_t *color = NULL; + if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (clear_info->image) { + const struct v3dv_image *image = clear_info->image; + +@@ -101,20 +102,16 @@ emit_rcl_prologue(struct v3dv_job *job, + } + } + ++ color = &clear_info->clear_value->color[0]; ++ + #if V3D_VERSION == 42 +- const uint32_t *color = &clear_info->clear_value->color[0]; + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = color[0]; + clear.clear_color_next_24_bits = color[1] & 0x00ffffff; + clear.render_target_number = 0; + }; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); +-#endif + + if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { +-#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((color[1] >> 24) | (color[2] << 8)); +@@ -122,25 +119,16 @@ emit_rcl_prologue(struct v3dv_job *job, + ((color[2] >> 24) | ((color[3] & 0xffff) << 8)); + clear.render_target_number = 0; + }; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); +-#endif +- + } + + if (tiling->internal_bpp >= 
V3D_INTERNAL_BPP_128 || clear_pad) { +-#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = color[3] >> 16; + clear.render_target_number = 0; + }; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); +-#endif + } ++#endif + } + + #if V3D_VERSION == 42 +@@ -150,8 +138,11 @@ emit_rcl_prologue(struct v3dv_job *job, + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } + #endif ++ + #if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ if (color) ++ rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); +@@ -161,6 +152,24 @@ emit_rcl_prologue(struct v3dv_job *job, + rt.base_address = 0; + rt.render_target_number = 0; + } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) color[1]) | ++ (((uint64_t) (color[2] & 0xff)) << 32); ++ rt.render_target_number = 0; ++ } ++ } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (color[3])) << 24); ++ rt.render_target_number = 0; ++ } ++ } + #endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { +@@ -229,9 +238,8 @@ emit_frame_setup(struct v3dv_job *job, + } + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); + #endif +- + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +-- +2.39.2 + diff --git 
a/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch new file mode 100644 index 0000000000..2bf2de50b7 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch @@ -0,0 +1,49 @@ +From 1e9d7d69849fa646b331f7661c74ee138badc4bb Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 25 Oct 2021 01:37:12 +0200 +Subject: [PATCH 088/142] v3dv: fix up texture shader state for v71 + +There are some new fields for YCbCr with pointers for the various +planes in multi-planar formats. These need to match the base address +pointer in the texture state, or the hardware will assume this is a +multi-planar texture. +--- + src/broadcom/vulkan/v3dvx_image.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index dac6ff2741f..848290c2a47 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -129,6 +129,14 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, + iplane); + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++#if V3D_VERSION >= 71 ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } + } +@@ -191,5 +199,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + buffer_view->offset; + + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++#if V3D_VERSION >= 71 ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 
6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch b/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch new file mode 100644 index 0000000000..7647e30707 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch @@ -0,0 +1,52 @@ +From 1f150a3a92741f7654a13626bd5b27b5575f2b76 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Mon, 25 Oct 2021 01:38:31 +0200 +Subject: [PATCH 089/142] v3dv: handle new texture state transfer functions in + v71 + +--- + src/broadcom/vulkan/v3dvx_image.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index 848290c2a47..437d4588c7e 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -108,15 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + + tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; + ++ bool is_srgb = vk_format_is_srgb(image_view->vk.format); + #if V3D_VERSION == 42 + tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; + #endif + + #if V3D_VERSION == 42 +- tex.srgb = vk_format_is_srgb(image_view->vk.view_format); ++ tex.srgb = is_srgb; + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; + #endif + + /* At this point we don't have the job. 
That's the reason the first +@@ -181,11 +182,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; ++ ++ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); + #if V3D_VERSION == 42 +- tex.srgb = vk_format_is_srgb(buffer_view->vk_format); ++ tex.srgb = is_srgb; + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; + #endif + + /* At this point we don't have the job. That's the reason the first +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch new file mode 100644 index 0000000000..69401c2100 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch @@ -0,0 +1,42 @@ +From 45de9f019ee92635de9a505db58439f0f4561281 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 08:14:11 +0200 +Subject: [PATCH 090/142] v3dv: implement noop job for v71 + +--- + src/broadcom/vulkan/v3dvx_queue.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c +index 1a26d04aef7..f8cee36e3bf 100644 +--- a/src/broadcom/vulkan/v3dvx_queue.c ++++ b/src/broadcom/vulkan/v3dvx_queue.c +@@ -46,7 +46,8 @@ v3dX(job_emit_noop)(struct v3dv_job *job) + config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ config.log2_tile_width = 3; /* Tile size 64 */ ++ config.log2_tile_height = 3; /* Tile size 64 */ + #endif + } + +@@ -58,10 +59,13 @@ v3dX(job_emit_noop)(struct v3dv_job *job) + } + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 
71 not supported yet."); ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.internal_bpp = V3D_INTERNAL_BPP_32; ++ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ rt.stride = 1; /* Unused RT */ ++ } + #endif + +- + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; + clear.stencil_clear_value = 0; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch new file mode 100644 index 0000000000..066e45d424 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch @@ -0,0 +1,117 @@ +From 3e607bb28056bb52242be6878281efae84026813 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 08:23:48 +0200 +Subject: [PATCH 091/142] v3dv: handle render pass global clear for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 66 ++++++++++++++++---------- + 1 file changed, 41 insertions(+), 25 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 1b39e230580..48b2e319e51 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -362,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, + iview->vk.base_array_layer + layer, + image_plane); + ++ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it ++ * is broken in earlier V3D versions. 
++ */ ++ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); ++ + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = buffer; + store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); +@@ -484,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + const VkImageAspectFlags aspects = + vk_format_aspects(ds_attachment->desc.format); + ++#if V3D_VERSION <= 42 ++ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken ++ * for depth/stencil. ++ * ++ * There used to be some confusion regarding the Clear Tile Buffers ++ * Z/S bit also being broken, but we confirmed with Broadcom that this ++ * is not the case, it was just that some other hardware bugs (that we ++ * need to work around, such as GFXH-1461) could cause this bit to behave ++ * incorrectly. ++ * ++ * There used to be another issue where the RTs bit in the Clear Tile ++ * Buffers packet also cleared Z/S, but Broadcom confirmed this is ++ * fixed since V3D 4.1. ++ * ++ * So if we have to emit a clear of depth or stencil we don't use ++ * the per-buffer store clear bit, even if we need to store the buffers, ++ * instead we always have to use the Clear Tile Buffers Z/S bit. ++ * If we have configured the job to do early Z/S clearing, then we ++ * don't want to emit any Clear Tile Buffers command at all here. ++ * ++ * Note that GFXH-1689 is not reproduced in the simulator, where ++ * using the clear buffer bit in depth/stencil stores works fine. ++ */ ++ + /* Only clear once on the first subpass that uses the attachment */ + uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
+ ds_attachment->first_subpass : +@@ -503,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + ds_attachment->desc.stencilLoadOp, + subpass->do_stencil_clear_with_draw); + ++ use_global_zs_clear = !state->job->early_zs_clear && ++ (needs_depth_clear || needs_stencil_clear); ++#endif ++#if V3D_VERSION >= 71 ++ /* The store command's clear buffer bit cannot be used for Z/S stencil: ++ * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, ++ * so we don't want to emit redundant clears here. ++ */ ++ use_global_zs_clear = false; ++#endif ++ + /* Skip the last store if it is not required */ + uint32_t ds_last_subpass = !pass->multiview_enabled ? + ds_attachment->last_subpass : +@@ -545,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + needs_stencil_store = subpass->resolve_stencil; + } + +- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken +- * for depth/stencil. +- * +- * There used to be some confusion regarding the Clear Tile Buffers +- * Z/S bit also being broken, but we confirmed with Broadcom that this +- * is not the case, it was just that some other hardware bugs (that we +- * need to work around, such as GFXH-1461) could cause this bit to behave +- * incorrectly. +- * +- * There used to be another issue where the RTs bit in the Clear Tile +- * Buffers packet also cleared Z/S, but Broadcom confirmed this is +- * fixed since V3D 4.1. +- * +- * So if we have to emit a clear of depth or stencil we don't use +- * the per-buffer store clear bit, even if we need to store the buffers, +- * instead we always have to use the Clear Tile Buffers Z/S bit. +- * If we have configured the job to do early Z/S clearing, then we +- * don't want to emit any Clear Tile Buffers command at all here. +- * +- * Note that GFXH-1689 is not reproduced in the simulator, where +- * using the clear buffer bit in depth/stencil stores works fine. 
+- */ +- use_global_zs_clear = !state->job->early_zs_clear && +- (needs_depth_clear || needs_stencil_clear); + if (needs_depth_store || needs_stencil_store) { + const uint32_t zs_buffer = + v3dv_zs_buffer(needs_depth_store, needs_stencil_store); +@@ -673,7 +689,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + } + #endif + #if V3D_VERSION >= 71 +- unreachable("Hardware generation 71 not supported yet."); ++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear); + #endif + } + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch new file mode 100644 index 0000000000..0251f31b56 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch @@ -0,0 +1,32 @@ +From 3794f6f08c559c4e442b57e992d501fb7d515b9b Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 08:31:04 +0200 +Subject: [PATCH 092/142] v3dv: GFX-1461 does not affect V3D 7.x + +--- + src/broadcom/vulkan/v3dv_pass.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c +index 20f5014268d..3e82c15df88 100644 +--- a/src/broadcom/vulkan/v3dv_pass.c ++++ b/src/broadcom/vulkan/v3dv_pass.c +@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device, + + /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), + * the clear might get lost. If a subpass has this then we can't emit +- * the clear using the TLB and we have to do it as a draw call. ++ * the clear using the TLB and we have to do it as a draw call. This ++ * issue is fixed since V3D 4.3.18. + * + * FIXME: separate stencil. 
+ */ +- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { ++ if (device->devinfo.ver == 42 && ++ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + struct v3dv_render_pass_attachment *att = + &pass->attachments[subpass->ds_attachment.attachment]; + if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch new file mode 100644 index 0000000000..2b9aa1538c --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch @@ -0,0 +1,69 @@ +From 5be7f484210103e40b77fa3135042da4a8406659 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 28 Sep 2021 08:59:08 +0200 +Subject: [PATCH 093/142] v3dv: update thread end restrictions validation for + v71 + +--- + src/broadcom/compiler/qpu_validate.c | 37 +++++++++++++++++++++++++--- + 1 file changed, 34 insertions(+), 3 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 1082fb7d50a..0466ee5d0b6 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -316,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if ((inst->alu.add.op != V3D_QPU_A_NOP && + !inst->alu.add.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "ADD RF write at THREND"); ++ } ++ if (inst->alu.add.waddr == 2 || ++ inst->alu.add.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if ((inst->alu.mul.op != V3D_QPU_M_NOP && + 
!inst->alu.mul.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "MUL RF write at THREND"); ++ } ++ ++ if (inst->alu.mul.waddr == 2 || ++ inst->alu.mul.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71 && ++ (inst->sig_addr == 2 || ++ inst->sig_addr == 3)) { ++ fail_instr(state, "RF2-3 write after THREND"); ++ } + } + + /* GFXH-1625: No TMUWT in the last instruction */ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch new file mode 100644 index 0000000000..50989e8ea6 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch @@ -0,0 +1,68 @@ +From a751dff57b6d769f5b031054cc65415cc3b44c08 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 08:22:59 +0200 +Subject: [PATCH 094/142] v3dv: handle early Z/S clears for v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 30 ++++++++++++++++++++------ + 1 file changed, 23 insertions(+), 7 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 48b2e319e51..4580e2a4650 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -998,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + * Early-Z/S clearing is independent of Early Z/S testing, so it is + * possible to enable one but not the other so long as their 
+ * respective requirements are met. ++ * ++ * From V3D 4.5.6, Z/S buffers are always cleared automatically ++ * between tiles, but we still want to enable early ZS clears ++ * when Z/S are not loaded or stored. + */ + struct v3dv_render_pass_attachment *ds_attachment = + &pass->attachments[ds_attachment_idx]; +@@ -1005,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + +- bool needs_depth_clear = +- check_needs_clear(state, +- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, +- ds_attachment->first_subpass, +- ds_attachment->desc.loadOp, +- subpass->do_depth_clear_with_draw); +- + bool needs_depth_store = + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp) || + subpass->resolve_depth; ++#if V3D_VERSION <= 42 ++ bool needs_depth_clear = ++ check_needs_clear(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ subpass->do_depth_clear_with_draw); + + do_early_zs_clear = needs_depth_clear && !needs_depth_store; ++#endif ++#if V3D_VERSION >= 71 ++ bool needs_depth_load = ++ v3dv_cmd_buffer_check_needs_load(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ ds_attachment->last_subpass, ++ ds_attachment->desc.storeOp); ++ do_early_zs_clear = !needs_depth_load && !needs_depth_store; ++#endif ++ + if (do_early_zs_clear && + vk_format_has_stencil(ds_attachment->desc.format)) { + bool needs_stencil_load = +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch new file mode 100644 index 0000000000..11ab68bfb4 --- /dev/null +++ 
b/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch @@ -0,0 +1,34 @@ +From 2add46ebce4760bf8349606201324ee0e6b1f9da Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 29 Sep 2021 09:07:28 +0200 +Subject: [PATCH 095/142] v3dv: handle RTs with no color targets in v71 + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 4580e2a4650..750486a6ccf 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1175,6 +1175,17 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + #endif + } + ++#if V3D_VERSION >= 71 ++ /* If we don't have any color RTs, we still need to emit one and flag ++ * it as not used using stride = 1. ++ */ ++ if (subpass->color_count == 0) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.stride = 1; ++ } ++ } ++#endif ++ + #if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + cmd_buffer_render_pass_setup_render_target +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch new file mode 100644 index 0000000000..10f1c52764 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch @@ -0,0 +1,85 @@ +From 019abbd34d2d904d6bb33f9fa4433cb53ca7899c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 1 Oct 2021 15:18:38 +0200 +Subject: [PATCH 096/142] v3dv: no specific separate_segments flag for V3D 7.1 + +On V3D 7.1 there is not a flag on the Shader State Record to specify +if we are using shared or separate segments. 
This is done by setting +the vpm input size to 0 (so we need to ensure that the output would be +the max needed for input/output). + +We were already doing the latter on the prog_data_vs, so we just need +to use those values, instead of assigning default values. + +As we are here, we also add some comments on the compiler part. +--- + src/broadcom/compiler/qpu_schedule.c | 4 ++++ + src/broadcom/compiler/vir.c | 4 ++++ + src/broadcom/vulkan/v3dvx_pipeline.c | 15 +++++++++++++-- + 3 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 77fb6a794e6..4f767296860 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -297,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + /* If the input and output segments are shared, then all VPM reads to + * a location need to happen before all writes. We handle this by + * serializing all VPM operations for now. ++ * ++ * FIXME: we are assuming that the segments are shared. That is ++ * correct right now as we are only using shared, but technically you ++ * can choose. + */ + bool separate_vpm_segment = false; + +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index 7612eed7130..dd0aa761c43 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -745,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, + + /* Set us up for shared input/output segments. This is apparently + * necessary for our VCM setup to avoid varying corruption. ++ * ++ * FIXME: initially testing on V3D 7.1 seems to work fine when using ++ * separate segments. So we could try to reevaluate in the future, if ++ * there is any advantage of using separate segments. 
+ */ + prog_data->separate_segments = false; + prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index a640c1d084a..a72ca3c241b 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -452,14 +452,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + prog_data_vs_bin->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs->separate_segments; +-#endif +- + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->separate_segments ? + prog_data_vs_bin->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->separate_segments ? + prog_data_vs->vpm_input_size : 1; ++#endif ++ ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. 
That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ prog_data_vs_bin->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ prog_data_vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + prog_data_vs_bin->vpm_output_size; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch new file mode 100644 index 0000000000..d0018b9f0e --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch @@ -0,0 +1,39 @@ +From 4f6b4f91577ec04aab907d59d836d0c17731a9d0 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 7 Oct 2021 12:43:49 +0200 +Subject: [PATCH 097/142] v3dv: don't convert floating point border colors in + v71 + +The TMU does this for us now. +--- + src/broadcom/vulkan/v3dvx_device.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index e235983864c..72daefadb08 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -118,7 +118,11 @@ static union pipe_color_union encode_border_color( + (1 << (desc->channel[i].size - 1)) - 1); + } + +- /* convert from float to expected format */ ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ + if (vk_format_is_srgb(bc_info->format) || + vk_format_is_compressed(bc_info->format)) { + for (int i = 0; i < 4; i++) +@@ -170,6 +174,7 @@ static union pipe_color_union encode_border_color( + } + } + } ++#endif + + return border; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch new file mode 100644 index 0000000000..aec7084bd4 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch @@ -0,0 +1,60 @@ +From d8083cb8f104e0f035f5b812e000a500fa52d66f Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 15 Oct 2021 13:06:31 +0200 +Subject: [PATCH 098/142] v3dv: handle Z clipping in v71 + +Fixes the following tests: + +dEQP-VK.clipping.clip_volume.* +dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_* (except deltazero) +--- + src/broadcom/vulkan/v3dvx_pipeline.c | 33 ++++++++++++++++++++++++++++ + 1 file changed, 33 insertions(+) + +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index a72ca3c241b..7b1133f8173 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -227,6 +227,39 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; + + pipeline->z_updates_enable = config.z_updates_enable; ++ ++#if V3D_VERSION >= 71 ++ /* From the Vulkan spec: ++ * ++ * "depthClampEnable controls whether to clamp the fragment’s depth ++ * values as described in Depth Test. If the pipeline is not created ++ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present ++ * then enabling depth clamp will also disable clipping primitives to ++ * the z planes of the frustrum as described in Primitive Clipping. ++ * Otherwise depth clipping is controlled by the state set in ++ * VkPipelineRasterizationDepthClipStateCreateInfoEXT." 
++ * ++ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually ++ * supported in the driver yet, so in practice we are always enabling Z ++ * clipping for now. ++ */ ++ bool z_clip_enable = false; ++ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = ++ ds_info ? vk_find_struct_const(ds_info->pNext, ++ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : ++ NULL; ++ if (clip_info) ++ z_clip_enable = clip_info->depthClipEnable; ++ else if (!(rs_info && rs_info->depthClampEnable)) ++ z_clip_enable = true; ++ ++ if (z_clip_enable) { ++ config.z_clipping_mode = pipeline->negative_one_to_one ? ++ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; ++ } else { ++ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; ++ } ++#endif + }; + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch new file mode 100644 index 0000000000..d69b668ccf --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch @@ -0,0 +1,44 @@ +From 2925fa6dc936d9268a59d8d7d4a775e89fd3fbdb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 17 Nov 2021 11:33:59 +0100 +Subject: [PATCH 099/142] broadcom/common: add TFU register definitions for v71 + +--- + src/broadcom/common/v3d_tfu.h | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h +index 80da224ca2d..572d0074794 100644 +--- a/src/broadcom/common/v3d_tfu.h ++++ b/src/broadcom/common/v3d_tfu.h +@@ -48,4 +48,27 @@ + #define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 + #define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 + ++/* Disable level 0 write, just write following mipmaps */ ++#define V3D71_TFU_IOC_DIMTW (1 << 0) ++#define V3D71_TFU_IOC_FORMAT_SHIFT 12 
++#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 ++#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 ++#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 ++ ++#define V3D71_TFU_IOC_STRIDE_SHIFT 16 ++#define V3D71_TFU_IOC_NUMMM_SHIFT 4 ++ ++#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 ++#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 ++#define V3D71_TFU_ICFG_FORMAT_RASTER 0 ++#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 ++#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 ++#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 ++#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 ++#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 ++ + #endif +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch new file mode 100644 index 0000000000..8f275d0f02 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch @@ -0,0 +1,67 @@ +From 6d10aa8a64e009d4d1f4f05885621bd2d9a72465 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 23 Sep 2021 13:09:41 +0200 +Subject: [PATCH 100/142] broadcom/simulator: TFU register names changed for + v71 + +--- + src/broadcom/simulator/v3dx_simulator.c | 39 +++++++++++++++---------- + 1 file changed, 23 insertions(+), 16 deletions(-) + +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index f23b0538de3..494f44a6b5d 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -182,26 +182,33 @@ v3d_flush_caches(struct v3d_hw *v3d) + v3d_flush_l2t(v3d); + } + ++#if V3D_VERSION < 71 ++#define TFU_REG(NAME) V3D_TFU_ ## NAME ++#else ++#define TFU_REG(NAME) V3D_IFC_ ## NAME ++#endif ++ 
++ + int + v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_tfu *args) + { +- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; +- +- V3D_WRITE(V3D_TFU_IIA, args->iia); +- V3D_WRITE(V3D_TFU_IIS, args->iis); +- V3D_WRITE(V3D_TFU_ICA, args->ica); +- V3D_WRITE(V3D_TFU_IUA, args->iua); +- V3D_WRITE(V3D_TFU_IOA, args->ioa); +- V3D_WRITE(V3D_TFU_IOS, args->ios); +- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); +- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); +- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); +- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); +- +- V3D_WRITE(V3D_TFU_ICFG, args->icfg); +- +- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { ++ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET; ++ ++ V3D_WRITE(TFU_REG(IIA), args->iia); ++ V3D_WRITE(TFU_REG(IIS), args->iis); ++ V3D_WRITE(TFU_REG(ICA), args->ica); ++ V3D_WRITE(TFU_REG(IUA), args->iua); ++ V3D_WRITE(TFU_REG(IOA), args->ioa); ++ V3D_WRITE(TFU_REG(IOS), args->ios); ++ V3D_WRITE(TFU_REG(COEF0), args->coef[0]); ++ V3D_WRITE(TFU_REG(COEF1), args->coef[1]); ++ V3D_WRITE(TFU_REG(COEF2), args->coef[2]); ++ V3D_WRITE(TFU_REG(COEF3), args->coef[3]); ++ ++ V3D_WRITE(TFU_REG(ICFG), args->icfg); ++ ++ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + v3d_hw_tick(v3d); + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch new file mode 100644 index 0000000000..bf9e2ccdcd --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch @@ -0,0 +1,119 @@ +From 780f012747f2cc6e816b1955081dbeca9a0abe5c Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 23 Sep 2021 12:12:18 +0200 +Subject: [PATCH 101/142] v3dv: add support for TFU jobs in v71 + +--- + include/drm-uapi/v3d_drm.h | 5 ++++ + src/broadcom/simulator/v3dx_simulator.c | 3 ++ + 
src/broadcom/vulkan/v3dvx_meta_common.c | 37 +++++++++++++++++++++++++ + 3 files changed, 45 insertions(+) + +diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h +index 3dfc0af8756..1a7d7a689de 100644 +--- a/include/drm-uapi/v3d_drm.h ++++ b/include/drm-uapi/v3d_drm.h +@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu { + + /* Pointer to an array of ioctl extensions*/ + __u64 extensions; ++ ++ struct { ++ __u32 ioc; ++ __u32 pad; ++ } v71; + }; + + /* Submits a compute shader for dispatch. This job will block on any +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index 494f44a6b5d..4ea177c9bb7 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -200,6 +200,9 @@ v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, + V3D_WRITE(TFU_REG(ICA), args->ica); + V3D_WRITE(TFU_REG(IUA), args->iua); + V3D_WRITE(TFU_REG(IOA), args->ioa); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(TFU_REG(IOC), args->v71.ioc); ++#endif + V3D_WRITE(TFU_REG(IOS), args->ios); + V3D_WRITE(TFU_REG(COEF0), args->coef[0]); + V3D_WRITE(TFU_REG(COEF1), args->coef[1]); +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index 09ebcfa97c1..b8f3297bc94 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -950,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + + tfu.iia |= src_offset; + ++#if V3D_VERSION <= 42 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; + } else { +@@ -958,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + V3D33_TFU_ICFG_FORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; ++#endif ++#if V3D_VERSION >= 71 ++ if (src_tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ 
tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; ++#endif + + tfu.ioa = dst_offset; + ++#if V3D_VERSION <= 42 + tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT; ++#endif ++ ++#if V3D_VERSION >= 71 ++ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT; ++ ++ switch (dst_tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++#endif + + switch (src_tiling) { + case V3D_TILING_UIF_NO_XOR: +@@ -980,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + /* The TFU can handle raster sources but always produces UIF results */ + assert(dst_tiling != V3D_TILING_RASTER); + ++#if V3D_VERSION <= 42 + /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the + * OPAD field for the destination (how many extra UIF blocks beyond + * those necessary to cover the height). 
+@@ -991,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + uif_block_h; + tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; + } ++#endif + + v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch b/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch new file mode 100644 index 0000000000..946565c402 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch @@ -0,0 +1,155 @@ +From 07cba940af2fe0c40641816bee280b57a40973fb Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 20 Oct 2021 11:22:11 +0200 +Subject: [PATCH 102/142] v3dv: make v3dv_viewport_compute_xform depend on the + V3D version + +For 4.x we have a workaround for too small Z scale values that is +not required for V3D 7.x. +--- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 40 +++----------------------- + src/broadcom/vulkan/v3dv_pipeline.c | 7 +++-- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 37 ++++++++++++++++++++++++ + src/broadcom/vulkan/v3dvx_private.h | 5 ++++ + 4 files changed, 50 insertions(+), 39 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index 96360a96b44..bda0a614523 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -2131,39 +2131,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, + } + } + +-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? 
*/ +-void +-v3dv_viewport_compute_xform(const VkViewport *viewport, +- float scale[3], +- float translate[3]) +-{ +- float x = viewport->x; +- float y = viewport->y; +- float half_width = 0.5f * viewport->width; +- float half_height = 0.5f * viewport->height; +- double n = viewport->minDepth; +- double f = viewport->maxDepth; +- +- scale[0] = half_width; +- translate[0] = half_width + x; +- scale[1] = half_height; +- translate[1] = half_height + y; +- +- scale[2] = (f - n); +- translate[2] = n; +- +- /* It seems that if the scale is small enough the hardware won't clip +- * correctly so we work around this my choosing the smallest scale that +- * seems to work. +- * +- * This case is exercised by CTS: +- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero +- */ +- const float min_abs_scale = 0.000009f; +- if (fabs(scale[2]) < min_abs_scale) +- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; +-} +- + /* Considers the pipeline's negative_one_to_one state and applies it to the + * current viewport transform if needed to produce the resulting Z translate + * and scale parameters. 
+@@ -2216,9 +2183,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, + viewportCount * sizeof(*pViewports)); + + for (uint32_t i = firstViewport; i < total_count; i++) { +- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], +- state->dynamic.viewport.scale[i], +- state->dynamic.viewport.translate[i]); ++ v3dv_X(cmd_buffer->device, viewport_compute_xform) ++ (&state->dynamic.viewport.viewports[i], ++ state->dynamic.viewport.scale[i], ++ state->dynamic.viewport.translate[i]); + } + + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index d012ff8f948..2156176d4cc 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2661,9 +2661,10 @@ pipeline_init_dynamic_state( + pViewportState->viewportCount); + + for (uint32_t i = 0; i < dynamic->viewport.count; i++) { +- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], +- dynamic->viewport.scale[i], +- dynamic->viewport.translate[i]); ++ v3dv_X(pipeline->device, viewport_compute_xform) ++ (&dynamic->viewport.viewports[i], ++ dynamic->viewport.scale[i], ++ dynamic->viewport.translate[i]); + } + } + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 750486a6ccf..f7c13a22423 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1285,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(rcl, END_OF_RENDERING, end); + } + ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]) ++{ ++ float x = viewport->x; ++ float y = viewport->y; ++ float half_width = 0.5f * viewport->width; ++ float half_height = 0.5f * viewport->height; ++ double n = viewport->minDepth; ++ double f = viewport->maxDepth; ++ ++ scale[0] = half_width; ++ translate[0] = half_width + x; ++ scale[1] = 
half_height; ++ translate[1] = half_height + y; ++ ++ scale[2] = (f - n); ++ translate[2] = n; ++ ++ /* It seems that if the scale is small enough the hardware won't clip ++ * correctly so we work around this my choosing the smallest scale that ++ * seems to work. ++ * ++ * This case is exercised by CTS: ++ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero ++ * ++ * V3D 7.x fixes this by using the new ++ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. ++ */ ++#if V3D_VERSION <= 42 ++ const float min_abs_scale = 0.0005f; ++ if (fabs(scale[2]) < min_abs_scale) ++ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + { +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index 036ce11b455..81715520913 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -339,3 +339,8 @@ v3dX(clamp_for_format_and_type)(uint32_t rt_type, + uint32_t + v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); ++ ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch b/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch new file mode 100644 index 0000000000..82f934720c --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch @@ -0,0 +1,51 @@ +From c6b60ee47c50474030f8a0a92bd4c6a071f926dc Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 14 Feb 2023 10:09:53 +0100 +Subject: [PATCH 103/142] v3dv: fix depth clipping then Z scale is too small in + V3D 7.x + +When the Z scale is too small guardband clipping may not clip +correctly, so disable it, which is a new option in V3D 7.x. 
+ +This fixes this test in V3D 7.x without needing any workarounds: +dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index f7c13a22423..3566649aafd 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1363,10 +1363,28 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0, + &translate_z, &scale_z); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ /* If the Z scale is too small guardband clipping may not clip correctly */ ++ if (fabsf(scale_z) < 0.01f) { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } else { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } ++#endif ++ + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { + /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, + * we are using OpenGL's [-1, 1] instead. 
+-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch b/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch new file mode 100644 index 0000000000..83c6351641 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch @@ -0,0 +1,30 @@ +From 46e2b22f43290e6fe92f5435af174c4b18bb6ef5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 22:52:47 +0200 +Subject: [PATCH 104/142] v3d: add a non-conformant warning for not fully + supported hw + +--- + src/gallium/drivers/v3d/v3d_screen.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c +index 98ca9bb69e6..efdb7d615ae 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.c ++++ b/src/gallium/drivers/v3d/v3d_screen.c +@@ -922,6 +922,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, + if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl)) + goto fail; + ++ if (screen->devinfo.ver >= 71) { ++ fprintf(stderr, "WARNING: v3d support for hw version %i is neither " ++ "a complete nor a conformant OpenGL implementation. 
Testing " ++ "use only.\n", screen->devinfo.ver); ++ } ++ + driParseConfigFiles(config->options, config->options_info, 0, "v3d", + NULL, NULL, NULL, 0, NULL, 0); + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch b/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch new file mode 100644 index 0000000000..07bed87a0c --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch @@ -0,0 +1,336 @@ +From 46ffdc57ac7fbe71e92b22e1fe93185f3d33a3ac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 23 May 2023 23:32:37 +0200 +Subject: [PATCH 105/142] v3d: add v71 hw generation + +Starting point for v71 version inclusion: + * Adds as one of the versions to be compiled on meson + * Updated the v3d_X and v3dX macros to include version 71 + * Update the code enough to get it building when using v71. + +Any real v71 support will be implemented on following commits. 
+--- + src/gallium/drivers/v3d/meson.build | 2 +- + src/gallium/drivers/v3d/v3d_context.h | 22 +++++++++++++---- + src/gallium/drivers/v3d/v3dx_draw.c | 21 +++++++++++++--- + src/gallium/drivers/v3d/v3dx_emit.c | 11 +++++++++ + src/gallium/drivers/v3d/v3dx_rcl.c | 35 ++++++++++++++++++++++----- + src/gallium/drivers/v3d/v3dx_state.c | 12 +++++++++ + 6 files changed, 88 insertions(+), 15 deletions(-) + +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index dfa1e88097b..526a131ae9b 100644 +--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -58,7 +58,7 @@ if dep_v3dv3.found() + v3d_args += '-DUSE_V3D_SIMULATOR' + endif + +-v3d_versions = ['33', '42'] ++v3d_versions = ['33', '42', '71'] + + per_version_libs = [] + foreach ver : v3d_versions +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index 97850b0363e..ad267d5033c 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -818,13 +818,21 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + + /* Helper to call hw ver specific functions */ + #define v3d_X(devinfo, thing) ({ \ +- __typeof(&v3d42_##thing) v3d_X_thing; \ +- if ((devinfo)->ver >= 42) \ +- v3d_X_thing = &v3d42_##thing; \ +- else if ((devinfo)->ver >= 33) \ ++ __typeof(&v3d33_##thing) v3d_X_thing; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ + v3d_X_thing = &v3d33_##thing; \ +- else \ ++ break; \ ++ case 42: \ ++ v3d_X_thing = &v3d42_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ ++ default: \ + unreachable("Unsupported hardware generation"); \ ++ } \ + v3d_X_thing; \ + }) + +@@ -838,6 +846,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + # define v3dX(x) v3d42_##x + # include "v3dx_context.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_context.h" ++# undef v3dX + #endif + + #endif /* V3D_CONTEXT_H */ +diff 
--git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 17442500ea9..2c74c5973c9 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -95,7 +95,11 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + #endif + + assert(!job->msaa || !job->double_buffer); +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = job->draw_width; + config.height_in_pixels = job->draw_height; +@@ -107,7 +111,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#else /* V3D_VERSION < 40 */ ++#endif ++#if V3D_VERSION < 40 + /* "Binning mode lists start with a Tile Binning Mode Configuration + * item (120)" + * +@@ -134,7 +139,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#endif /* V3D_VERSION < 40 */ ++#endif + + /* There's definitely nothing in the VCD cache we want. */ + cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); +@@ -655,10 +660,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + /* XXX: Use combined input/output size flag in the common + * case. + */ ++#if V3D_VERSION <= 42 + shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.cs->prog_data.vs->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.vs->prog_data.vs->separate_segments; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + shader.coordinate_shader_input_vpm_segment_size = + v3d->prog.cs->prog_data.vs->separate_segments ? 
+@@ -724,9 +734,14 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.instance_id_read_by_vertex_shader = + v3d->prog.vs->prog_data.vs->uses_iid; + ++#if V3D_VERSION <= 42 + shader.address_of_default_attribute_values = + cl_address(v3d_resource(vtx->defaults)->bo, + vtx->defaults_offset); ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + } + + bool cs_loaded_any = false; +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 0ad3fb68b1e..5af3d03b337 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -512,6 +512,7 @@ v3dX(emit_state)(struct pipe_context *pctx) + /* Note: EZ state may update based on the compiled FS, + * along with ZSA + */ ++#if V3D_VERSION <= 42 + config.early_z_updates_enable = + (job->ez_state != V3D_EZ_DISABLED); + if (v3d->zsa->base.depth_enabled) { +@@ -524,6 +525,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + } else { + config.depth_test_function = PIPE_FUNC_ALWAYS; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + config.stencil_enable = + v3d->zsa->base.stencil[0].enabled; +@@ -564,12 +569,18 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + + if (v3d->dirty & V3D_DIRTY_VIEWPORT) { ++#if V3D_VERSION <= 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = + v3d->viewport.scale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = + v3d->viewport.scale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 82547437c25..166cc34e4ee 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -419,10 
+419,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) + * clearing Z/S. + */ + if (job->clear) { ++#if V3D_VERSION <= 42 + cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + } + #endif /* V3D_VERSION >= 40 */ + } +@@ -483,7 +489,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer) + } + } + +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + static void + v3d_setup_render_target(struct v3d_job *job, int cbuf, + uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp) +@@ -507,9 +513,9 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf, + else + *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif + +-#else /* V3D_VERSION < 40 */ +- ++#if V3D_VERSION < 40 + static void + v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf, + struct v3d_resource *rsc, bool is_separate_stencil) +@@ -656,7 +662,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer) + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } +-#else ++#endif ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + for (int i = 0; i < 2; i++) { + if (i > 0) + cl_emit(&job->rcl, TILE_COORDINATES, coords); +@@ -673,6 +680,10 @@ emit_render_layer(struct v3d_job *job, uint32_t layer) + cl_emit(&job->rcl, END_OF_TILE_MARKER, end); + } + #endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + + cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); + +@@ -775,7 +786,13 @@ v3dX(emit_rcl)(struct v3d_job *job) + config.multisample_mode_4x = job->msaa; + config.double_buffer_in_non_ms_mode = job->double_buffer; + ++#if V3D_VERSION <= 42 + config.maximum_bpp_of_all_render_targets = job->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not 
supported yet."); ++#endif ++ + } + + for (int i = 0; i < job->nr_cbufs; i++) { +@@ -786,7 +803,7 @@ v3dX(emit_rcl)(struct v3d_job *job) + struct v3d_resource *rsc = v3d_resource(psurf->texture); + + UNUSED uint32_t config_pad = 0; +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + + /* XXX: Set the pad for raster. */ + if (surf->tiling == V3D_TILING_UIF_NO_XOR || +@@ -819,6 +836,7 @@ v3dX(emit_rcl)(struct v3d_job *job) + } + #endif /* V3D_VERSION < 40 */ + ++#if V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, + clear) { + clear.clear_color_low_32_bits = job->clear_color[i][0]; +@@ -847,9 +865,10 @@ v3dX(emit_rcl)(struct v3d_job *job) + clear.render_target_number = i; + }; + } ++#endif + } + +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + v3d_setup_render_target(job, 0, + &rt.render_target_0_internal_bpp, +@@ -870,6 +889,10 @@ v3dX(emit_rcl)(struct v3d_job *job) + } + #endif + ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + #if V3D_VERSION < 40 + /* FIXME: Don't bother emitting if we don't load/clear Z/S. */ + if (job->zsbuf) { +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 0f1735fee66..a93d5be091e 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -990,7 +990,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + cso->u.buf.size); + } + ++#if V3D_VERSION <= 42 + tex.srgb = util_format_is_srgb(cso->format); ++#endif ++ ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + #if V3D_VERSION >= 40 + tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]); +@@ -1040,7 +1046,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + * shader code if we wanted to read an MSAA sRGB + * texture without sRGB decode. 
+ */ ++#if V3D_VERSION <= 42 + tex.srgb = false; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif ++ + } else { + tex.texture_type = v3d_get_tex_format(&screen->devinfo, + cso->format); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch new file mode 100644 index 0000000000..dafba1550e --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch @@ -0,0 +1,39 @@ +From 1ef6241854666a00d43401039809f2470d3a2cc0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 20 Oct 2021 14:31:10 +0200 +Subject: [PATCH 106/142] v3d: emit TILE_BINNING_MODE_CFG for v71 + +--- + src/gallium/drivers/v3d/v3dx_draw.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 2c74c5973c9..9f38baa0bbf 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -96,7 +96,21 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + assert(!job->msaa || !job->double_buffer); + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { ++ config.width_in_pixels = job->draw_width; ++ config.height_in_pixels = job->draw_height; ++ ++ config.log2_tile_width = log2_tile_size(job->tile_width); ++ config.log2_tile_height = log2_tile_size(job->tile_height); ++ ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. 
++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++ } ++ + #endif + + #if V3D_VERSION >= 40 && V3D_VERSION <= 42 +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch new file mode 100644 index 0000000000..f3bfe3eac3 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch @@ -0,0 +1,44 @@ +From dfdfcf3853d7178acff288a368dfc169018c186a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 20 Oct 2021 14:42:43 +0200 +Subject: [PATCH 107/142] v3d: emit TILE_RENDERING_MODE_CFG_COMMON for v71 + +--- + src/gallium/drivers/v3d/v3dx_rcl.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 166cc34e4ee..3f5eb293c4e 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -23,8 +23,9 @@ + + #include "util/format/u_format.h" + #include "v3d_context.h" +-#include "broadcom/common/v3d_tiling.h" + #include "broadcom/common/v3d_macros.h" ++#include "broadcom/common/v3d_tiling.h" ++#include "broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + + #define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \ +@@ -790,7 +791,15 @@ v3dX(emit_rcl)(struct v3d_job *job) + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ config.log2_tile_width = log2_tile_size(job->tile_width); ++ config.log2_tile_height = log2_tile_size(job->tile_height); ++ ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). 
We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); + #endif + + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch b/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch new file mode 100644 index 0000000000..de56d89812 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch @@ -0,0 +1,186 @@ +From 34b32f1ee504449e39529110631c389fa9e9e409 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 20 Oct 2021 15:12:15 +0200 +Subject: [PATCH 108/142] v3d: TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1 + +--- + src/gallium/drivers/v3d/v3dx_rcl.c | 130 +++++++++++++++++++++++++---- + 1 file changed, 115 insertions(+), 15 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 3f5eb293c4e..815e1098c22 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -490,10 +490,86 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer) + } + } + ++#if V3D_VERSION > 33 ++/* Note that for v71, render target cfg packets has just one field that ++ * combined the internal type and clamp mode. For simplicity we keep just one ++ * helper. ++ * ++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)". 
++ * ++ */ ++static uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ enum pipe_format format) ++{ ++#if V3D_VERSION == 42 ++ if (util_format_is_pure_integer(format)) { ++ return V3D_RENDER_TARGET_CLAMP_INT; ++ } else if (util_format_is_srgb(format)) { ++ return V3D_RENDER_TARGET_CLAMP_NORM; ++ } else { ++ return V3D_RENDER_TARGET_CLAMP_NONE; ++ } ++#endif ++#if V3D_VERSION >= 71 ++ switch (rt_type) { ++ case V3D_INTERNAL_TYPE_8I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; ++ case V3D_INTERNAL_TYPE_8UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_8: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ case V3D_INTERNAL_TYPE_16I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; ++ case V3D_INTERNAL_TYPE_16UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_16F: ++ return util_format_is_srgb(format) ? ++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : ++ V3D_RENDER_TARGET_TYPE_CLAMP_16F; ++ case V3D_INTERNAL_TYPE_32I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; ++ case V3D_INTERNAL_TYPE_32UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_32F: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F; ++ default: ++ unreachable("Unknown internal render target type"); ++ } ++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; ++#endif ++ return 0; ++} ++#endif ++ ++#if V3D_VERSION >= 71 ++static void ++v3d_setup_render_target(struct v3d_job *job, ++ int cbuf, ++ uint32_t *rt_bpp, ++ uint32_t *rt_type_clamp) ++{ ++ if (!job->cbufs[cbuf]) ++ return; ++ ++ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]); ++ *rt_bpp = surf->internal_bpp; ++ if (job->bbuf) { ++ struct v3d_surface *bsurf = v3d_surface(job->bbuf); ++ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp); ++ } ++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type, ++ surf->base.format); ++} ++#endif ++ + #if V3D_VERSION >= 40 && V3D_VERSION <= 42 + static void +-v3d_setup_render_target(struct 
v3d_job *job, int cbuf, +- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp) ++v3d_setup_render_target(struct v3d_job *job, ++ int cbuf, ++ uint32_t *rt_bpp, ++ uint32_t *rt_type, ++ uint32_t *rt_clamp) + { + if (!job->cbufs[cbuf]) + return; +@@ -505,14 +581,8 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf, + *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp); + } + *rt_type = surf->internal_type; +- if (util_format_is_srgb(surf->base.format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; +-#if V3D_VERSION >= 42 +- else if (util_format_is_pure_integer(surf->base.format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; +-#endif +- else +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; ++ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type, ++ surf->base.format); + } + #endif + +@@ -804,10 +874,30 @@ v3dX(emit_rcl)(struct v3d_job *job) + + } + ++#if V3D_VERSION >= 71 ++ uint32_t base_addr = 0; ++ ++ /* If we don't have any color RTs, we sill need to emit one and flat ++ * it as not used using stride = 1 ++ */ ++ if (job->nr_cbufs == 0) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.stride = 1; /* Unused */ ++ } ++ } ++#endif + for (int i = 0; i < job->nr_cbufs; i++) { + struct pipe_surface *psurf = job->cbufs[i]; +- if (!psurf) ++ if (!psurf) { ++#if V3D_VERSION >= 71 ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.render_target_number = i; ++ rt.stride = 1; /* Unused */ ++ } ++#endif + continue; ++ } ++ + struct v3d_surface *surf = v3d_surface(psurf); + struct v3d_resource *rsc = v3d_resource(psurf->texture); + +@@ -874,6 +964,20 @@ v3dX(emit_rcl)(struct v3d_job *job) + clear.render_target_number = i; + }; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = job->clear_color[i][0]; ++ v3d_setup_render_target(job, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ 
v3d_compute_rt_row_row_stride_128_bits(job->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ base_addr += (job->tile_height * rt.stride) / 8; ++ } + #endif + } + +@@ -898,10 +1002,6 @@ v3dX(emit_rcl)(struct v3d_job *job) + } + #endif + +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif +- + #if V3D_VERSION < 40 + /* FIXME: Don't bother emitting if we don't load/clear Z/S. */ + if (job->zsbuf) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch new file mode 100644 index 0000000000..fbb87ab660 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch @@ -0,0 +1,60 @@ +From 8496282476420e7e5d9d31f6cfd87f3f3b136446 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 01:47:29 +0200 +Subject: [PATCH 109/142] v3d: emit CLEAR_RENDER_TARGETS for v71 + +--- + src/gallium/drivers/v3d/v3dx_rcl.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 815e1098c22..4274be042bd 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -427,7 +427,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) + } + #endif + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear); + #endif + + } +@@ -734,7 +734,7 @@ emit_render_layer(struct v3d_job *job, uint32_t layer) + store.buffer_to_store = NONE; + } + #endif +-#if V3D_VERSION >= 40 && V3D_VERSION <= 42 ++#if V3D_VERSION >= 40 + for (int i = 0; i < 2; i++) { + if (i > 0) + cl_emit(&job->rcl, TILE_COORDINATES, coords); +@@ -742,20 +742,20 @@ 
emit_render_layer(struct v3d_job *job, uint32_t layer) + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } ++ + if (i == 0 || do_double_initial_tile_clear(job)) { ++#if V3D_VERSION < 71 + cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } ++#else ++ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear); ++#endif + } + cl_emit(&job->rcl, END_OF_TILE_MARKER, end); + } + #endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif +- +- + cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); + + v3d_rcl_emit_generic_per_tile_list(job, layer); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch b/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch new file mode 100644 index 0000000000..e3dbb971af --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch @@ -0,0 +1,43 @@ +From 4de1ace1c7b3b6436a5de8e4c6a2f52d6308ff5c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 13:09:03 +0200 +Subject: [PATCH 110/142] v3d: just don't fill up early-z fields for CFG_BITS + for v71 + +v71 doesn't include early_z_enable/early_z_updates_enable. They are +configured with packet 121. 
+--- + src/gallium/drivers/v3d/v3dx_emit.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 5af3d03b337..de05ae29d04 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -515,20 +515,19 @@ v3dX(emit_state)(struct pipe_context *pctx) + #if V3D_VERSION <= 42 + config.early_z_updates_enable = + (job->ez_state != V3D_EZ_DISABLED); ++#endif + if (v3d->zsa->base.depth_enabled) { + config.z_updates_enable = + v3d->zsa->base.depth_writemask; ++#if V3D_VERSION <= 42 + config.early_z_enable = + config.early_z_updates_enable; ++#endif + config.depth_test_function = + v3d->zsa->base.depth_func; + } else { + config.depth_test_function = PIPE_FUNC_ALWAYS; + } +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif + + config.stencil_enable = + v3d->zsa->base.stencil[0].enabled; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch new file mode 100644 index 0000000000..78e45af498 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch @@ -0,0 +1,30 @@ +From 0683f6db1cd50659829fe53f49427bfdacb707b6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 13:14:32 +0200 +Subject: [PATCH 111/142] v3d: emit CLIPPER_XY_SCALING for v71 + +--- + src/gallium/drivers/v3d/v3dx_emit.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index de05ae29d04..58c886bb29e 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -577,7 +577,12 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + #endif + #if V3D_VERSION >= 71 +- 
unreachable("HW generation 71 not supported yet."); ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = ++ v3d->viewport.scale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = ++ v3d->viewport.scale[1] * 64.0f; ++ } + #endif + + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch new file mode 100644 index 0000000000..cf420be0f5 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch @@ -0,0 +1,53 @@ +From 1d1aa5ce739644c72b44ffe547b7233ad19e26b5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 13:19:49 +0200 +Subject: [PATCH 112/142] v3d: no specific separate_segments flag for V3D 7.1 + +On V3D 7.1 there is not a flag on the Shader State Record to specify +if we are using shared or separate segments. This is done by setting +the vpm input size to 0 (so we need to ensure that the output would be +the max needed for input/output). + +We were already doing the latter on the prog_data_vs, so we just need +to use those values, instead of assigning default values. 
+--- + src/gallium/drivers/v3d/v3dx_draw.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 9f38baa0bbf..dd13e5177fe 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -679,17 +679,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + v3d->prog.cs->prog_data.vs->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.vs->prog_data.vs->separate_segments; +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif +- + shader.coordinate_shader_input_vpm_segment_size = + v3d->prog.cs->prog_data.vs->separate_segments ? + v3d->prog.cs->prog_data.vs->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + v3d->prog.vs->prog_data.vs->separate_segments ? + v3d->prog.vs->prog_data.vs->vpm_input_size : 1; ++#endif ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. 
That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ v3d->prog.cs->prog_data.vs->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ v3d->prog.vs->prog_data.vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + v3d->prog.cs->prog_data.vs->vpm_output_size; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch new file mode 100644 index 0000000000..b3e7369ea0 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch @@ -0,0 +1,113 @@ +From 3a790ddd27c8406c59426599fb9cadb5de5c024d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 13:37:46 +0200 +Subject: [PATCH 113/142] v3d: default vertex attributes values are not needed + for v71 + +--- + src/gallium/drivers/v3d/v3d_context.h | 1 + + src/gallium/drivers/v3d/v3dx_draw.c | 3 -- + src/gallium/drivers/v3d/v3dx_state.c | 53 ++++++++++++++++++--------- + 3 files changed, 37 insertions(+), 20 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index ad267d5033c..c0aac741fdc 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj { + unsigned num_elements; + + uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)]; ++ /* defaults can be NULL for some hw generation */ + struct pipe_resource *defaults; + uint32_t defaults_offset; + }; +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index dd13e5177fe..4bff2ea6478 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -759,9 +759,6 @@ 
v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.address_of_default_attribute_values = + cl_address(v3d_resource(vtx->defaults)->bo, + vtx->defaults_offset); +-#endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); + #endif + } + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index a93d5be091e..3d3c4fb0f47 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -337,6 +337,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso) + v3d->dirty |= V3D_DIRTY_ZSA; + } + ++ ++static bool ++needs_default_attribute_values(void) ++{ ++#if V3D_VERSION <= 42 ++ /* FIXME: on vulkan we are able to refine even further, as we know in ++ * advance when we create the pipeline if we have a integer vertex ++ * attrib. Pending to check if we could do something similar here. ++ */ ++ return true; ++#endif ++ return false; ++} ++ + static void * + v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + const struct pipe_vertex_element *elements) +@@ -414,24 +428,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + } + } + +- /* Set up the default attribute values in case any of the vertex +- * elements use them. +- */ +- uint32_t *attrs; +- u_upload_alloc(v3d->state_uploader, 0, +- V3D_MAX_VS_INPUTS * sizeof(float), 16, +- &so->defaults_offset, &so->defaults, (void **)&attrs); +- +- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- if (i < so->num_elements && +- util_format_is_pure_integer(so->pipe[i].src_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); ++ if (needs_default_attribute_values()) { ++ /* Set up the default attribute values in case any of the vertex ++ * elements use them. 
++ */ ++ uint32_t *attrs; ++ u_upload_alloc(v3d->state_uploader, 0, ++ V3D_MAX_VS_INPUTS * sizeof(float), 16, ++ &so->defaults_offset, &so->defaults, (void **)&attrs); ++ ++ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ if (i < so->num_elements && ++ util_format_is_pure_integer(so->pipe[i].src_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } + } ++ } else { ++ so->defaults = NULL; ++ so->defaults_offset = 0; + } + + u_upload_unmap(v3d->state_uploader); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch b/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch new file mode 100644 index 0000000000..d197620253 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch @@ -0,0 +1,100 @@ +From 8e3a2a35df5789687993d05436602821186e1cf2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 13:46:11 +0200 +Subject: [PATCH 114/142] v3d/uniforms: update VIEWPORT_X/Y_SCALE uniforms for + v71 + +As the packet CLIPPER_XY scaling, this needs to be computed on 1/64ths +of pixel, instead of 1/256ths of pixels. + +As this is the usual values that we get from macros, we add manually a +v42 and v71 macro, and define a new helper to get those. + +Those granularity values are the same for Vulkan and OpenGL, so +perhaps we should move them to a common place. + +As with v3dv, V3D_X macro name is somewhat confusing. It is +specifically created to ask for define values that depends on the +version. But I also felt that V3D_DEFINE_X was too long. 
+--- + src/gallium/drivers/v3d/v3d_context.h | 28 ++++++++++++++++++++++++++ + src/gallium/drivers/v3d/v3d_uniforms.c | 8 ++++++-- + 2 files changed, 34 insertions(+), 2 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index c0aac741fdc..21ee10a90cc 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -837,6 +837,34 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + v3d_X_thing; \ + }) + ++/* FIXME: The same for vulkan/opengl. Common place? define it at the ++ * v3d_packet files? ++ */ ++#define V3D33_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(devinfo, thing) ({ \ ++ __typeof(V3D33_##thing) V3D_X_THING; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ ++ V3D_X_THING = V3D33_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ + #ifdef v3dX + # include "v3dx_context.h" + #else +diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c +index 95eb838954f..1b8758bae7d 100644 +--- a/src/gallium/drivers/v3d/v3d_uniforms.c ++++ b/src/gallium/drivers/v3d/v3d_uniforms.c +@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + struct v3d_compiled_shader *shader, + enum pipe_shader_type stage) + { ++ struct v3d_device_info *devinfo = &v3d->screen->devinfo; + struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage]; + struct v3d_texture_stateobj *texstate = &v3d->tex[stage]; + struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; +@@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + struct v3d_cl_out 
*uniforms = + cl_start(&job->indirect); + ++ float clipper_xy_granularity = ++ V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); ++ + for (int i = 0; i < uinfo->count; i++) { + uint32_t data = uinfo->data[i]; + +@@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + cl_aligned_u32(&uniforms, gallium_uniforms[data]); + break; + case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity); + break; + case QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity); + break; + + case QUNIFORM_VIEWPORT_Z_OFFSET: +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch new file mode 100644 index 0000000000..e9f5e92927 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch @@ -0,0 +1,43 @@ +From aa6f70116d9e7be56cdb52b55d75419bf7209185 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Thu, 21 Oct 2021 23:21:02 +0200 +Subject: [PATCH 115/142] v3d: handle new texture state transfer functions in + v71 + +--- + src/gallium/drivers/v3d/v3dx_state.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 3d3c4fb0f47..b5e572b13c5 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -1009,12 +1009,12 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + cso->u.buf.size); + } + ++ bool is_srgb = util_format_is_srgb(cso->format); + #if V3D_VERSION <= 42 +- tex.srgb = util_format_is_srgb(cso->format); ++ tex.srgb = 
is_srgb; + #endif +- + #if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; + #endif + + #if V3D_VERSION >= 40 +@@ -1068,9 +1068,6 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + #if V3D_VERSION <= 42 + tex.srgb = false; + #endif +-#if V3D_VERSION >= 71 +- unreachable("HW generation 71 not supported yet."); +-#endif + + } else { + tex.texture_type = v3d_get_tex_format(&screen->devinfo, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch b/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch new file mode 100644 index 0000000000..2ce6d66bd2 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch @@ -0,0 +1,62 @@ +From aefc98b6aefc38caa6f6efd421db6d02c42596a7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 22 Oct 2021 10:54:24 +0200 +Subject: [PATCH 116/142] v3d: handle new TEXTURE_SHADER_STATE v71 YCbCr fields + +There are some new fields for YCbCr with pointers for the various +planes in multi-planar formats. These need to match the base address +pointer in the texture state, or the hardware will assume this is a +multi-planar texture. + +Notice we don't use an address type for these fields in the XML +description. This is because the addresses are 64-bit aligned (even +though the PRM doesn't say it) which means the 6 LSB bits are +implicitly 0, but the fields are encoded before the 6th bit of their +starting byte, so we can't use the usual trick we do with address +types where the first 6 bits in the byte are implicitly overwritten by +other fields and we have to encode this manually as a uint field. 
This +would mean that if we had an actual BO we would also need to add it +manually to the job's list, but since we don't have one, we don't have +to do anything about it. +--- + src/gallium/drivers/v3d/v3dx_state.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index b5e572b13c5..c08a072157b 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -936,17 +936,26 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, + } + + tex->base_level = base_level; ++ + #if V3D_VERSION >= 40 + tex->max_level = last_level; + /* Note that we don't have a job to reference the texture's sBO + * at state create time, so any time this sampler view is used + * we need to add the texture to the job. + */ +- tex->texture_base_pointer = +- cl_address(NULL, +- rsc->bo->offset + +- v3d_layer_offset(prsc, 0, first_layer)); ++ const uint32_t base_offset = rsc->bo->offset + ++ v3d_layer_offset(prsc, 0, first_layer); ++ ++ tex->texture_base_pointer = cl_address(NULL, base_offset); + #endif ++#if V3D_VERSION >= 71 ++ tex->chroma_offset_x = 1; ++ tex->chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex->texture_base_pointer_cb = base_offset >> 6; ++ tex->texture_base_pointer_cr = base_offset >> 6; ++#endif ++ + tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; + + /* Since other platform devices may produce UIF images even +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch b/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch new file mode 100644 index 0000000000..5f7cdbd03f --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch @@ -0,0 +1,42 @@ +From 
fcb3fc1ead4344da59c4b26a81878d53f8f4a291 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 22 Oct 2021 11:40:49 +0200 +Subject: [PATCH 117/142] v3d: setup render pass color clears for any format + bpp in v71 + +--- + src/gallium/drivers/v3d/v3dx_rcl.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 4274be042bd..d3fbc9aff5d 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -978,6 +978,24 @@ v3dX(emit_rcl)(struct v3d_job *job) + + base_addr += (job->tile_height * rt.stride) / 8; + } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) job->clear_color[i][1]) | ++ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (job->clear_color[i][3])) << 24); ++ rt.render_target_number = i; ++ } ++ } + #endif + } + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch new file mode 100644 index 0000000000..56e27cf09c --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch @@ -0,0 +1,29 @@ +From ceb088c05f351b40df14069bd6e0de777288ece4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 22 Oct 2021 12:17:45 +0200 +Subject: [PATCH 118/142] v3d: GFX-1461 does not affect V3D 7.x + +--- + src/gallium/drivers/v3d/v3dx_draw.c | 5 +++-- + 1 file 
changed, 3 insertions(+), 2 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 4bff2ea6478..04cc3bc3ae1 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -1593,9 +1593,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers, + /* GFXH-1461: If we were to emit a load of just depth or just stencil, + * then the clear for the other may get lost. We need to decide now + * if it would be possible to need to emit a load of just one after +- * we've set up our TLB clears. ++ * we've set up our TLB clears. This issue is fixed since V3D 4.3.18. + */ +- if (buffers & PIPE_CLEAR_DEPTHSTENCIL && ++ if (v3d->screen->devinfo.ver <= 42 && ++ buffers & PIPE_CLEAR_DEPTHSTENCIL && + (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && + job->zsbuf && + util_format_is_depth_and_stencil(job->zsbuf->texture->format)) { +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch new file mode 100644 index 0000000000..c3cdfc0355 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch @@ -0,0 +1,55 @@ +From b44a8785c5436fb28b6734d3bac806d3a82c828d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 22 Oct 2021 13:41:09 +0200 +Subject: [PATCH 119/142] v3d: don't convert floating point border colors in + v71 + +The TMU does this for us now. 
+--- + src/gallium/drivers/v3d/v3dx_state.c | 29 ++++++++++++++-------------- + 1 file changed, 15 insertions(+), 14 deletions(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index c08a072157b..348a7bcf3da 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -718,21 +718,22 @@ v3d_upload_sampler_state_variant(void *map, + break; + } + +- if (variant >= V3D_SAMPLER_STATE_32) { +- sampler.border_color_word_0 = border.ui[0]; +- sampler.border_color_word_1 = border.ui[1]; +- sampler.border_color_word_2 = border.ui[2]; +- sampler.border_color_word_3 = border.ui[3]; +- } else { +- sampler.border_color_word_0 = +- _mesa_float_to_half(border.f[0]); +- sampler.border_color_word_1 = +- _mesa_float_to_half(border.f[1]); +- sampler.border_color_word_2 = +- _mesa_float_to_half(border.f[2]); +- sampler.border_color_word_3 = +- _mesa_float_to_half(border.f[3]); ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ ++ if (variant < V3D_SAMPLER_STATE_32) { ++ border.ui[0] = _mesa_float_to_half(border.f[0]); ++ border.ui[1] = _mesa_float_to_half(border.f[1]); ++ border.ui[2] = _mesa_float_to_half(border.f[2]); ++ border.ui[3] = _mesa_float_to_half(border.f[3]); + } ++#endif ++ sampler.border_color_word_0 = border.ui[0]; ++ sampler.border_color_word_1 = border.ui[1]; ++ sampler.border_color_word_2 = border.ui[2]; ++ sampler.border_color_word_3 = border.ui[3]; + } + } + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch new file mode 100644 index 0000000000..ef5d2ade88 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch @@ -0,0 +1,39 @@ +From ecc1a5fa6b09a684a1e831c342121ec417f1a101 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 22 Oct 2021 14:26:29 +0200 +Subject: [PATCH 120/142] v3d: handle Z clipping in v71 + +--- + src/gallium/drivers/v3d/v3dx_emit.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 58c886bb29e..75751dc9ab6 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -539,8 +539,21 @@ v3dX(emit_state)(struct pipe_context *pctx) + v3d_line_smoothing_enabled(v3d) ? + V3D_LINE_RASTERIZATION_PERP_END_CAPS : + V3D_LINE_RASTERIZATION_DIAMOND_EXIT; +- } + ++#if V3D_VERSION >= 71 ++ /* The following follows the logic implemented at v3dv ++ * plus the definition of depth_clip_near/far and ++ * depth_clamp. ++ * ++ * Note: some extensions are not supported by v3d ++ * (like ARB_depth_clamp) that would affect this, but ++ * the values on rasterizer are taking that into ++ * account. 
++ */ ++ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near || ++ v3d->rasterizer->base.depth_clip_far; ++#endif ++ } + } + + if (v3d->dirty & V3D_DIRTY_RASTERIZER && +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch new file mode 100644 index 0000000000..8275072cbe --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch @@ -0,0 +1,446 @@ +From ecac3d8441b75011446b566320194df17beba352 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Wed, 27 Oct 2021 02:03:10 +0200 +Subject: [PATCH 121/142] v3d: add support for TFU blit in v71 + +TFU has changed on v71, specially on which registers to use, so that +means that support code change across versions. So as part of this +commit TFU copying is moved to a v3dx file. +--- + src/gallium/drivers/v3d/meson.build | 1 + + src/gallium/drivers/v3d/v3d_blit.c | 164 +++----------------- + src/gallium/drivers/v3d/v3dx_context.h | 10 ++ + src/gallium/drivers/v3d/v3dx_tfu.c | 202 +++++++++++++++++++++++++ + 4 files changed, 232 insertions(+), 145 deletions(-) + create mode 100644 src/gallium/drivers/v3d/v3dx_tfu.c + +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index 526a131ae9b..b2e748573b7 100644 +--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -49,6 +49,7 @@ files_per_version = files( + 'v3dx_job.c', + 'v3dx_rcl.c', + 'v3dx_state.c', ++ 'v3dx_tfu.c', + ) + + v3d_args = ['-DV3D_BUILD_NEON'] +diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c +index 0260bdde6d1..96179f654a4 100644 +--- a/src/gallium/drivers/v3d/v3d_blit.c ++++ b/src/gallium/drivers/v3d/v3d_blit.c +@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info) + info->mask &= 
~PIPE_MASK_S; + } + +-static bool +-v3d_tfu(struct pipe_context *pctx, +- struct pipe_resource *pdst, +- struct pipe_resource *psrc, +- unsigned int src_level, +- unsigned int base_level, +- unsigned int last_level, +- unsigned int src_layer, +- unsigned int dst_layer, +- bool for_mipmap) +-{ +- struct v3d_context *v3d = v3d_context(pctx); +- struct v3d_screen *screen = v3d->screen; +- struct v3d_resource *src = v3d_resource(psrc); +- struct v3d_resource *dst = v3d_resource(pdst); +- struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; +- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; +- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; +- int width = u_minify(pdst->width0, base_level) * msaa_scale; +- int height = u_minify(pdst->height0, base_level) * msaa_scale; +- enum pipe_format pformat; +- +- if (psrc->format != pdst->format) +- return false; +- if (psrc->nr_samples != pdst->nr_samples) +- return false; +- +- /* Can't write to raster. */ +- if (dst_base_slice->tiling == V3D_TILING_RASTER) +- return false; +- +- /* When using TFU for blit, we are doing exact copies (both input and +- * output format must be the same, no scaling, etc), so there is no +- * pixel format conversions. Thus we can rewrite the format to use one +- * that is TFU compatible based on its texel size. 
+- */ +- if (for_mipmap) { +- pformat = pdst->format; +- } else { +- switch (dst->cpp) { +- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; +- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; +- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; +- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; +- case 1: pformat = PIPE_FORMAT_R8_UNORM; break; +- default: unreachable("unsupported format bit-size"); break; +- }; +- } +- +- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); +- struct v3d_device_info *devinfo = &screen->devinfo; +- +- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) { +- assert(for_mipmap); +- return false; +- } +- +- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); +- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); +- +- struct drm_v3d_submit_tfu tfu = { +- .ios = (height << 16) | width, +- .bo_handles = { +- dst->bo->handle, +- src != dst ? src->bo->handle : 0 +- }, +- .in_sync = v3d->out_sync, +- .out_sync = v3d->out_sync, +- }; +- uint32_t src_offset = (src->bo->offset + +- v3d_layer_offset(psrc, src_level, src_layer)); +- tfu.iia |= src_offset; +- if (src_base_slice->tiling == V3D_TILING_RASTER) { +- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } else { +- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + +- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } +- +- uint32_t dst_offset = (dst->bo->offset + +- v3d_layer_offset(pdst, base_level, dst_layer)); +- tfu.ioa |= dst_offset; +- if (last_level != base_level) +- tfu.ioa |= V3D33_TFU_IOA_DIMTW; +- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + +- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_IOA_FORMAT_SHIFT); +- +- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; +- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; +- +- switch (src_base_slice->tiling) { +- case 
V3D_TILING_UIF_NO_XOR: +- case V3D_TILING_UIF_XOR: +- tfu.iis |= (src_base_slice->padded_height / +- (2 * v3d_utile_height(src->cpp))); +- break; +- case V3D_TILING_RASTER: +- tfu.iis |= src_base_slice->stride / src->cpp; +- break; +- case V3D_TILING_LINEARTILE: +- case V3D_TILING_UBLINEAR_1_COLUMN: +- case V3D_TILING_UBLINEAR_2_COLUMN: +- break; +- } +- +- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the +- * OPAD field for the destination (how many extra UIF blocks beyond +- * those necessary to cover the height). When filling mipmaps, the +- * miplevel 1+ tiling state is inferred. +- */ +- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || +- dst_base_slice->tiling == V3D_TILING_UIF_XOR) { +- int uif_block_h = 2 * v3d_utile_height(dst->cpp); +- int implicit_padded_height = align(height, uif_block_h); +- +- tfu.icfg |= (((dst_base_slice->padded_height - +- implicit_padded_height) / uif_block_h) << +- V3D33_TFU_ICFG_OPAD_SHIFT); +- } +- +- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); +- if (ret != 0) { +- fprintf(stderr, "Failed to submit TFU job: %d\n", ret); +- return false; +- } +- +- dst->writes++; +- +- return true; +-} +- + bool + v3d_generate_mipmap(struct pipe_context *pctx, + struct pipe_resource *prsc, +@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx, + if (first_layer != last_layer) + return false; + +- return v3d_tfu(pctx, +- prsc, prsc, +- base_level, +- base_level, last_level, +- first_layer, first_layer, +- true); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, tfu)(pctx, ++ prsc, prsc, ++ base_level, ++ base_level, last_level, ++ first_layer, first_layer, ++ true); + } + + static void +@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + if (info->dst.format != info->src.format) + return; + +- if (v3d_tfu(pctx, 
info->dst.resource, info->src.resource, +- info->src.level, +- info->dst.level, info->dst.level, +- info->src.box.z, info->dst.box.z, +- false)) { ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource, ++ info->src.level, ++ info->dst.level, info->dst.level, ++ info->src.box.z, info->dst.box.z, ++ false)) { + info->mask &= ~PIPE_MASK_RGBA; + } + } +diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h +index 03d7c244ea2..e0a5cbfb2f3 100644 +--- a/src/gallium/drivers/v3d/v3dx_context.h ++++ b/src/gallium/drivers/v3d/v3dx_context.h +@@ -51,3 +51,13 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, + */ + bool v3dX(tfu_supports_tex_format)(uint32_t tex_format, + bool for_mipmap); ++ ++bool v3dX(tfu)(struct pipe_context *pctx, ++ struct pipe_resource *pdst, ++ struct pipe_resource *psrc, ++ unsigned int src_level, ++ unsigned int base_level, ++ unsigned int last_level, ++ unsigned int src_layer, ++ unsigned int dst_layer, ++ bool for_mipmap); +diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c +new file mode 100644 +index 00000000000..d6b51390a11 +--- /dev/null ++++ b/src/gallium/drivers/v3d/v3dx_tfu.c +@@ -0,0 +1,202 @@ ++/* ++ * Copyright © 2021 Broadcom ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * 
paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "v3d_context.h" ++#include "broadcom/common/v3d_tfu.h" ++ ++bool ++v3dX(tfu)(struct pipe_context *pctx, ++ struct pipe_resource *pdst, ++ struct pipe_resource *psrc, ++ unsigned int src_level, ++ unsigned int base_level, ++ unsigned int last_level, ++ unsigned int src_layer, ++ unsigned int dst_layer, ++ bool for_mipmap) ++{ ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_resource *src = v3d_resource(psrc); ++ struct v3d_resource *dst = v3d_resource(pdst); ++ struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; ++ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; ++ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; ++ int width = u_minify(pdst->width0, base_level) * msaa_scale; ++ int height = u_minify(pdst->height0, base_level) * msaa_scale; ++ enum pipe_format pformat; ++ ++ if (psrc->format != pdst->format) ++ return false; ++ if (psrc->nr_samples != pdst->nr_samples) ++ return false; ++ ++ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D) ++ return false; ++ ++ /* Can't write to raster. */ ++ if (dst_base_slice->tiling == V3D_TILING_RASTER) ++ return false; ++ ++ /* When using TFU for blit, we are doing exact copies (both input and ++ * output format must be the same, no scaling, etc), so there is no ++ * pixel format conversions. 
Thus we can rewrite the format to use one ++ * that is TFU compatible based on its texel size. ++ */ ++ if (for_mipmap) { ++ pformat = pdst->format; ++ } else { ++ switch (dst->cpp) { ++ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; ++ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; ++ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; ++ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; ++ case 1: pformat = PIPE_FORMAT_R8_UNORM; break; ++ default: unreachable("unsupported format bit-size"); break; ++ }; ++ } ++ ++ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); ++ ++ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) { ++ assert(for_mipmap); ++ return false; ++ } ++ ++ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); ++ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); ++ ++ struct drm_v3d_submit_tfu tfu = { ++ .ios = (height << 16) | width, ++ .bo_handles = { ++ dst->bo->handle, ++ src != dst ? src->bo->handle : 0 ++ }, ++ .in_sync = v3d->out_sync, ++ .out_sync = v3d->out_sync, ++ }; ++ uint32_t src_offset = (src->bo->offset + ++ v3d_layer_offset(psrc, src_level, src_layer)); ++ tfu.iia |= src_offset; ++ ++ uint32_t dst_offset = (dst->bo->offset + ++ v3d_layer_offset(pdst, base_level, dst_layer)); ++ tfu.ioa |= dst_offset; ++ ++ switch (src_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.iis |= (src_base_slice->padded_height / ++ (2 * v3d_utile_height(src->cpp))); ++ break; ++ case V3D_TILING_RASTER: ++ tfu.iis |= src_base_slice->stride / src->cpp; ++ break; ++ case V3D_TILING_LINEARTILE: ++ case V3D_TILING_UBLINEAR_1_COLUMN: ++ case V3D_TILING_UBLINEAR_2_COLUMN: ++ break; ++ } ++ ++#if V3D_VERSION <= 42 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } else { ++ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - 
V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } ++ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.ioa |= V3D33_TFU_IOA_DIMTW; ++ ++ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_IOA_FORMAT_SHIFT); ++ ++ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; ++ ++ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the ++ * OPAD field for the destination (how many extra UIF blocks beyond ++ * those necessary to cover the height). When filling mipmaps, the ++ * miplevel 1+ tiling state is inferred. ++ */ ++ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || ++ dst_base_slice->tiling == V3D_TILING_UIF_XOR) { ++ int uif_block_h = 2 * v3d_utile_height(dst->cpp); ++ int implicit_padded_height = align(height, uif_block_h); ++ ++ tfu.icfg |= (((dst_base_slice->padded_height - ++ implicit_padded_height) / uif_block_h) << ++ V3D33_TFU_ICFG_OPAD_SHIFT); ++ } ++#endif /* V3D_VERSION <= 42 */ ++ ++#if V3D_VERSION >= 71 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW; ++ ++ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT); ++ ++ switch (dst_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) << ++ 
V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++ ++ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT; ++#endif /* V3D_VERSION >= 71*/ ++ ++ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); ++ if (ret != 0) { ++ fprintf(stderr, "Failed to submit TFU job: %d\n", ret); ++ return false; ++ } ++ ++ dst->writes++; ++ ++ return true; ++} ++ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch b/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch new file mode 100644 index 0000000000..105a224f18 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch @@ -0,0 +1,91 @@ +From ed7e118a6cc0c9bba9f02929e98bc51252331950 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 16 May 2023 00:28:27 +0200 +Subject: [PATCH 122/142] v3d/v3dv: fix texture state array stride packing for + V3D 7.1.5 + +--- + src/broadcom/vulkan/v3dvx_image.c | 7 +++++++ + src/gallium/drivers/v3d/v3dx_state.c | 20 +++++++++++++++----- + 2 files changed, 22 insertions(+), 5 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index 437d4588c7e..ae6eaa88d0c 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -118,6 +118,13 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + #endif + #if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++ ++ /* V3D 7.1.5 has array stride starting one bit later than previous ++ * V3D versions to make room for the new RB swap bit, but we don't ++ * handle that in the CLE parser. ++ */ ++ if (device->devinfo.rev >= 5) ++ tex.array_stride_64_byte_aligned <<= 1; + #endif + + /* At this point we don't have the job. 
That's the reason the first +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 348a7bcf3da..88e57cd072b 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -889,7 +889,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te + } + + static void +-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, ++v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo, ++ struct V3DX(TEXTURE_SHADER_STATE) *tex, + struct pipe_resource *prsc, + int base_level, int last_level, + int first_layer, int last_layer, +@@ -949,15 +950,22 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, + + tex->texture_base_pointer = cl_address(NULL, base_offset); + #endif ++ ++ tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; ++ + #if V3D_VERSION >= 71 + tex->chroma_offset_x = 1; + tex->chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex->texture_base_pointer_cb = base_offset >> 6; + tex->texture_base_pointer_cr = base_offset >> 6; +-#endif + +- tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; ++ /* V3D 7.1.5 has array stride start at bit 33 instead of bit 32 to ++ * make room for the RB swap bit. 
++ */ ++ if (devinfo->rev >= 5) ++ tex->array_stride_64_byte_aligned <<= 1; ++#endif + + /* Since other platform devices may produce UIF images even + * when they're not big enough for V3D to assume they're UIF, +@@ -1006,7 +1014,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + + v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { + if (prsc->target != PIPE_BUFFER) { +- v3d_setup_texture_shader_state(&tex, prsc, ++ v3d_setup_texture_shader_state(&v3d->screen->devinfo, ++ &tex, prsc, + cso->u.tex.first_level, + cso->u.tex.last_level, + cso->u.tex.first_layer, +@@ -1442,7 +1451,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d, + + v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { + if (prsc->target != PIPE_BUFFER) { +- v3d_setup_texture_shader_state(&tex, prsc, ++ v3d_setup_texture_shader_state(&v3d->screen->devinfo, ++ &tex, prsc, + iview->base.u.tex.level, + iview->base.u.tex.level, + iview->base.u.tex.first_layer, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch new file mode 100644 index 0000000000..19cffa9495 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch @@ -0,0 +1,499 @@ +From 48893b056a07b7eda4fe3dea7f068c403981b621 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 12 Nov 2021 10:35:59 +0100 +Subject: [PATCH 123/142] v3d,v3dv: support up to 8 render targets in v7.1+ + +--- + src/broadcom/common/v3d_limits.h | 3 +- + src/broadcom/common/v3d_util.c | 49 ++++++++++++++++++++++++-- + src/broadcom/common/v3d_util.h | 6 ++-- + src/broadcom/compiler/nir_to_vir.c | 10 +++--- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 5 +-- + src/broadcom/vulkan/v3dv_device.c | 6 ++-- + src/broadcom/vulkan/v3dv_limits.h | 2 -- + src/broadcom/vulkan/v3dv_meta_clear.c | 8 +++-- + 
src/broadcom/vulkan/v3dv_pass.c | 6 ++-- + src/broadcom/vulkan/v3dv_pipeline.c | 4 ++- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 7 ++-- + src/broadcom/vulkan/v3dvx_device.c | 1 - + src/gallium/drivers/v3d/v3d_blit.c | 2 +- + src/gallium/drivers/v3d/v3d_context.c | 5 +-- + src/gallium/drivers/v3d/v3d_context.h | 3 +- + src/gallium/drivers/v3d/v3d_job.c | 6 ++-- + src/gallium/drivers/v3d/v3d_screen.c | 3 +- + src/gallium/drivers/v3d/v3dx_emit.c | 14 +++++--- + src/gallium/drivers/v3d/v3dx_state.c | 5 +-- + 19 files changed, 104 insertions(+), 41 deletions(-) + +diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h +index 46f38bd7484..354c8784914 100644 +--- a/src/broadcom/common/v3d_limits.h ++++ b/src/broadcom/common/v3d_limits.h +@@ -42,7 +42,8 @@ + + #define V3D_MAX_SAMPLES 4 + +-#define V3D_MAX_DRAW_BUFFERS 4 ++#define V3D_MAX_DRAW_BUFFERS 8 ++#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8) + + #define V3D_MAX_POINT_SIZE 512.0f + #define V3D_MAX_LINE_WIDTH 32 +diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c +index 26f5c6b336f..209a5eceaa1 100644 +--- a/src/broadcom/common/v3d_util.c ++++ b/src/broadcom/common/v3d_util.c +@@ -88,8 +88,10 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + } + + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ uint32_t max_color_bpp, bool msaa, ++ bool double_buffer, + uint32_t *width, uint32_t *height) + { + static const uint8_t tile_sizes[] = { +@@ -103,7 +105,9 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, + }; + + uint32_t idx = 0; +- if (color_attachment_count > 2) ++ if (color_attachment_count > 4) ++ idx += 3; ++ else if (color_attachment_count > 2) + idx += 2; + else if (color_attachment_count > 1) + idx += 1; +@@ -117,6 +121,45 @@ 
v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, + + idx += max_color_bpp; + ++ if (devinfo->ver >= 71) { ++ /* In V3D 7.x the TLB has an auxiliary buffer of 8KB that will be ++ * automatically used for depth instead of the main 16KB depth TLB buffer ++ * when the depth tile fits in the auxiliary buffer, allowing the hardware ++ * to allocate the 16KB from the main depth TLB to the color TLB. If ++ * we can do that, then we are effectively doubling the memory we have ++ * for color and we can increase our tile dimensions by a factor of 2 ++ * (reduce idx by 1). ++ * ++ * If we have computed a tile size that would be smaller than the minimum ++ * of 8x8, then it is certain that depth will fit in the aux depth TLB ++ * (even in MSAA mode). ++ * ++ * Otherwise, we need check if we can fit depth in the aux TLB buffer ++ * using a larger tile size. ++ * ++ * FIXME: the docs state that depth TLB memory can be used for color ++ * if depth testing is not used by setting the 'depth disable' bit in the ++ * rendering configuration. However, this comes with a requirement that ++ * occlussion queries must not be active. We need to clarify if this means ++ * active at the point at which we emit a tile rendering configuration ++ * item, meaning that the we have a query spanning a full render pass ++ * (this is something we can tell before we emit the rendering ++ * configuration item) or active in the subpass for which we are enabling ++ * the bit (which we can't tell until later, when we record commands for ++ * the subpass). If it is the latter, then we cannot use this feature. ++ */ ++ if (idx >= ARRAY_SIZE(tile_sizes) / 2) { ++ idx--; ++ } else if (idx > 0) { ++ /* Depth is always 32bpp (4x32bpp for 4x MSAA) */ ++ uint32_t depth_bpp = !msaa ? 
4 : 16; ++ uint32_t tile_w = tile_sizes[(idx - 1) * 2]; ++ uint32_t tile_h = tile_sizes[(idx - 1) * 2 + 1]; ++ if (tile_w * tile_h * depth_bpp <= 8192) ++ idx--; ++ } ++ } ++ + assert(idx < ARRAY_SIZE(tile_sizes) / 2); + + *width = tile_sizes[idx * 2]; +diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h +index 864fc949ffa..5a7e244a0a5 100644 +--- a/src/broadcom/common/v3d_util.h ++++ b/src/broadcom/common/v3d_util.h +@@ -37,8 +37,10 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + uint32_t wg_size); + + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ uint32_t max_color_bpp, bool msaa, ++ bool double_buffer, + uint32_t *width, uint32_t *height); + + uint32_t +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index a8cf02dd386..531e85a1212 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -2483,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c) + + switch (var->data.location) { + case FRAG_RESULT_COLOR: +- c->output_color_var[0] = var; +- c->output_color_var[1] = var; +- c->output_color_var[2] = var; +- c->output_color_var[3] = var; ++ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ c->output_color_var[i] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: ++ case FRAG_RESULT_DATA4: ++ case FRAG_RESULT_DATA5: ++ case FRAG_RESULT_DATA6: ++ case FRAG_RESULT_DATA7: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index bda0a614523..11d161b19b7 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -365,7 +365,8 @@ 
job_compute_frame_tiling(struct v3dv_job *job, + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + +- v3d_choose_tile_size(render_target_count, max_internal_bpp, ++ v3d_choose_tile_size(&job->device->devinfo, ++ render_target_count, max_internal_bpp, + tiling->msaa, tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); + +@@ -1374,7 +1375,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) + } + + uint32_t att_count = 0; +- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ ++ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ + + /* We only need to emit subpass clears as draw calls for color attachments + * if the render area is not aligned to tile boundaries. +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index 01e2dd7ac2d..19e58542414 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -1366,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + const VkSampleCountFlags supported_sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + ++ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); ++ + struct timespec clock_res; + clock_getres(CLOCK_MONOTONIC, &clock_res); + const float timestamp_period = +@@ -1436,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .maxFragmentInputComponents = max_varying_components, + .maxFragmentOutputAttachments = 4, + .maxFragmentDualSrcAttachments = 0, +- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + ++ .maxFragmentCombinedOutputResources = max_rts + + MAX_STORAGE_BUFFERS + + MAX_STORAGE_IMAGES, + +@@ -1476,7 +1478,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .framebufferDepthSampleCounts = supported_sample_counts, + .framebufferStencilSampleCounts = supported_sample_counts, + .framebufferNoAttachmentsSampleCounts 
= supported_sample_counts, +- .maxColorAttachments = MAX_RENDER_TARGETS, ++ .maxColorAttachments = max_rts, + .sampledImageColorSampleCounts = supported_sample_counts, + .sampledImageIntegerSampleCounts = supported_sample_counts, + .sampledImageDepthSampleCounts = supported_sample_counts, +diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h +index 9cda9f0d6d2..8ac99724105 100644 +--- a/src/broadcom/vulkan/v3dv_limits.h ++++ b/src/broadcom/vulkan/v3dv_limits.h +@@ -50,8 +50,6 @@ + #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ + MAX_DYNAMIC_STORAGE_BUFFERS) + +-#define MAX_RENDER_TARGETS 4 +- + #define MAX_MULTIVIEW_VIEW_COUNT 16 + + /* These are tunable parameters in the HW design, but all the V3D +diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c +index d376c179e1c..0a7905b49d5 100644 +--- a/src/broadcom/vulkan/v3dv_meta_clear.c ++++ b/src/broadcom/vulkan/v3dv_meta_clear.c +@@ -747,7 +747,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, + uint32_t bit_offset = 0; + + key |= rt_idx; +- bit_offset += 2; ++ bit_offset += 3; + + key |= ((uint64_t) format) << bit_offset; + bit_offset += 32; +@@ -1189,9 +1189,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, + { + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + +- /* We can only clear attachments in the current subpass */ +- assert(attachmentCount <= 5); /* 4 color + D/S */ ++ /* We can have at most max_color_RTs + 1 D/S attachments */ ++ assert(attachmentCount <= ++ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + ++ /* We can only clear attachments in the current subpass */ + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + + assert(cmd_buffer->state.subpass_idx < pass->subpass_count); +diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c +index 3e82c15df88..7f2e2bbc710 100644 +--- a/src/broadcom/vulkan/v3dv_pass.c ++++ 
b/src/broadcom/vulkan/v3dv_pass.c +@@ -322,11 +322,11 @@ subpass_get_granularity(struct v3dv_device *device, + /* Granularity is defined by the tile size */ + assert(subpass_idx < pass->subpass_count); + struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; +- const uint32_t color_attachment_count = subpass->color_count; ++ const uint32_t color_count = subpass->color_count; + + bool msaa = false; + uint32_t max_bpp = 0; +- for (uint32_t i = 0; i < color_attachment_count; i++) { ++ for (uint32_t i = 0; i < color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; +@@ -349,7 +349,7 @@ subpass_get_granularity(struct v3dv_device *device, + * heuristics so we choose a conservative granularity here, with it disabled. + */ + uint32_t width, height; +- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, ++ v3d_choose_tile_size(&device->devinfo, color_count, max_bpp, msaa, + false /* double-buffer */, &width, &height); + *granularity = (VkExtent2D) { + .width = width, +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index 2156176d4cc..3bcdcc9a853 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2632,6 +2632,7 @@ pipeline_init_dynamic_state( + const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) + { + /* Initialize to default values */ ++ const struct v3d_device_info *devinfo = &pipeline->device->devinfo; + struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; + memset(dynamic, 0, sizeof(*dynamic)); + dynamic->stencil_compare_mask.front = ~0; +@@ -2639,7 +2640,8 @@ pipeline_init_dynamic_state( + dynamic->stencil_write_mask.front = ~0; + dynamic->stencil_write_mask.back = ~0; + dynamic->line_width = 1.0f; +- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; ++ dynamic->color_write_enable = ++ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 
1; + + /* Create a mask of enabled dynamic states */ + uint32_t dynamic_states = 0; +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 3566649aafd..bf5e47018e8 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1550,10 +1550,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + ++ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; ++ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); ++ + const uint32_t blend_packets_size = + cl_packet_length(BLEND_ENABLES) + + cl_packet_length(BLEND_CONSTANT_COLOR) + +- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; ++ cl_packet_length(BLEND_CFG) * max_color_rts; + + v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); + v3dv_return_if_oom(cmd_buffer, NULL); +@@ -1565,7 +1568,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + +- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (uint32_t i = 0; i < max_color_rts; i++) { + if (pipeline->blend.enables & (1 << i)) + cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); + } +diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index 72daefadb08..4d17a2691a5 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -49,7 +49,6 @@ vk_to_v3d_compare_func[] = { + [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, + }; + +- + static union pipe_color_union encode_border_color( + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { +diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c +index 96179f654a4..51ddc292ff7 100644 +--- a/src/gallium/drivers/v3d/v3d_blit.c ++++ b/src/gallium/drivers/v3d/v3d_blit.c +@@ -369,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info 
*info) + bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa; + + uint32_t tile_width, tile_height, max_bpp; +- v3d_get_tile_buffer_size(msaa, double_buffer, ++ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer, + is_color_blit ? 1 : 0, surfaces, src_surf, + &tile_width, &tile_height, &max_bpp); + +diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c +index f12e8c92139..def546e9ef5 100644 +--- a/src/gallium/drivers/v3d/v3d_context.c ++++ b/src/gallium/drivers/v3d/v3d_context.c +@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + } + + void +-v3d_get_tile_buffer_size(bool is_msaa, ++v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +@@ -247,7 +248,7 @@ v3d_get_tile_buffer_size(bool is_msaa, + *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp); + } + +- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, ++ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, *max_bpp, + is_msaa, double_buffer, + tile_width, tile_height); + } +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index 21ee10a90cc..eb184b4b203 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -795,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx); + void v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + enum pipe_shader_type shader); + +-void v3d_get_tile_buffer_size(bool is_msaa, ++void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c +index b022ed45073..577890a06c3 100644 +--- a/src/gallium/drivers/v3d/v3d_job.c ++++ b/src/gallium/drivers/v3d/v3d_job.c +@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d) + 
job->double_buffer = false; + } + +- v3d_get_tile_buffer_size(job->msaa, job->double_buffer, ++ v3d_get_tile_buffer_size(&v3d->screen->devinfo, ++ job->msaa, job->double_buffer, + job->nr_cbufs, job->cbufs, job->bbuf, +- &job->tile_width, &job->tile_height, ++ &job->tile_width, ++ &job->tile_height, + &job->internal_bpp); + + /* The dirty flags are tracking what's been updated while v3d->job has +diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c +index efdb7d615ae..2225edf85bd 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.c ++++ b/src/gallium/drivers/v3d/v3d_screen.c +@@ -255,9 +255,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return V3D_MAX_ARRAY_LAYERS; + +- /* Render targets. */ + case PIPE_CAP_MAX_RENDER_TARGETS: +- return 4; ++ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver); + + case PIPE_CAP_VENDOR_ID: + return 0x14E4; +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 75751dc9ab6..87e75281dc9 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -661,8 +661,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + #endif + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + if (blend->base.independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ for (int i = 0; i < max_rts; i++) + emit_rt_blend(v3d, job, &blend->base, i, + (1 << i), + v3d->blend_dst_alpha_one & (1 << i)); +@@ -678,16 +680,16 @@ v3dX(emit_state)(struct pipe_context *pctx) + * RTs without. 
+ */ + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + v3d->blend_dst_alpha_one, + true); + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + ~v3d->blend_dst_alpha_one, + false); + } else { + emit_rt_blend(v3d, job, &blend->base, 0, +- (1 << V3D_MAX_DRAW_BUFFERS) - 1, ++ (1 << max_rts) - 1, + v3d->blend_dst_alpha_one); + } + } +@@ -696,8 +698,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + if (v3d->dirty & V3D_DIRTY_BLEND) { + struct pipe_blend_state *blend = &v3d->blend->base; + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- for (int i = 0; i < 4; i++) { ++ for (int i = 0; i < max_rts; i++) { + int rt = blend->independent_blend_enable ? i : 0; + int rt_mask = blend->rt[rt].colormask; + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 88e57cd072b..970a082aa85 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -138,8 +138,9 @@ v3d_create_blend_state(struct pipe_context *pctx, + + so->base = *cso; + ++ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION); + if (cso->independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (int i = 0; i < max_rts; i++) { + so->blend_enables |= cso->rt[i].blend_enable << i; + + /* V3D 4.x is when we got independent blend enables. 
*/ +@@ -148,7 +149,7 @@ v3d_create_blend_state(struct pipe_context *pctx, + } + } else { + if (cso->rt[0].blend_enable) +- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1; ++ so->blend_enables = (1 << max_rts) - 1; + } + + return so; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch b/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch new file mode 100644 index 0000000000..2e193e0644 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch @@ -0,0 +1,539 @@ +From cc5afd808039f3e0b81fe0615745b74cbb31d0bf Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 16 Nov 2021 11:26:17 +0100 +Subject: [PATCH 124/142] v3d,v3dv: don't use max internal bpp for tile sizing + in V3D 7.x + +We can use the actual bpp of each color attachment to compute real +tile memory requirements, which may allow us to choose a larger tile +size configuration than in V3D 4.2 in certain scenarios. 
+--- + src/broadcom/common/v3d_util.c | 112 +++++++++++++++--------- + src/broadcom/common/v3d_util.h | 7 +- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 20 +++-- + src/broadcom/vulkan/v3dv_meta_clear.c | 1 + + src/broadcom/vulkan/v3dv_meta_copy.c | 19 ++-- + src/broadcom/vulkan/v3dv_pass.c | 9 +- + src/broadcom/vulkan/v3dv_private.h | 2 + + src/broadcom/vulkan/v3dvx_device.c | 21 +++-- + src/broadcom/vulkan/v3dvx_meta_common.c | 10 ++- + src/broadcom/vulkan/v3dvx_private.h | 4 +- + src/broadcom/vulkan/v3dvx_queue.c | 3 +- + src/gallium/drivers/v3d/v3d_context.c | 6 +- + 12 files changed, 140 insertions(+), 74 deletions(-) + +diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c +index 209a5eceaa1..8a50d279985 100644 +--- a/src/broadcom/common/v3d_util.c ++++ b/src/broadcom/common/v3d_util.c +@@ -87,12 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + return best_wgs_per_sg; + } + ++#define V3D71_TLB_COLOR_SIZE (16 * 1024) ++#define V3D71_TLB_DETPH_SIZE (16 * 1024) ++#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024) ++ ++static bool ++tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp) ++{ ++ /* First, we check if we can fit this tile size allocating the depth ++ * TLB memory to color. 
++ */ ++ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) { ++ return true; ++ } ++ ++ /* Otherwise the tile must fit in the main TLB buffers */ ++ return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE; ++} ++ + void + v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, +- uint32_t max_color_bpp, bool msaa, ++ /* V3D 4.x max internal bpp of all RTs */ ++ uint32_t max_internal_bpp, ++ /* V3D 7.x accumulated bpp for all RTs (in bytes) */ ++ uint32_t total_color_bpp, ++ bool msaa, + bool double_buffer, +- uint32_t *width, uint32_t *height) ++ uint32_t *width, ++ uint32_t *height) + { + static const uint8_t tile_sizes[] = { + 64, 64, +@@ -105,37 +130,19 @@ v3d_choose_tile_size(const struct v3d_device_info *devinfo, + }; + + uint32_t idx = 0; +- if (color_attachment_count > 4) +- idx += 3; +- else if (color_attachment_count > 2) +- idx += 2; +- else if (color_attachment_count > 1) +- idx += 1; +- +- /* MSAA and double-buffer are mutually exclusive */ +- assert(!msaa || !double_buffer); +- if (msaa) +- idx += 2; +- else if (double_buffer) +- idx += 1; +- +- idx += max_color_bpp; +- + if (devinfo->ver >= 71) { +- /* In V3D 7.x the TLB has an auxiliary buffer of 8KB that will be +- * automatically used for depth instead of the main 16KB depth TLB buffer +- * when the depth tile fits in the auxiliary buffer, allowing the hardware +- * to allocate the 16KB from the main depth TLB to the color TLB. If +- * we can do that, then we are effectively doubling the memory we have +- * for color and we can increase our tile dimensions by a factor of 2 +- * (reduce idx by 1). ++ /* In V3D 7.x, we use the actual bpp used by color attachments to compute ++ * the tile size instead of the maximum bpp. 
This may allow us to choose a ++ * larger tile size than we would in 4.x in scenarios with multiple RTs ++ * with different bpps. + * +- * If we have computed a tile size that would be smaller than the minimum +- * of 8x8, then it is certain that depth will fit in the aux depth TLB +- * (even in MSAA mode). +- * +- * Otherwise, we need check if we can fit depth in the aux TLB buffer +- * using a larger tile size. ++ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically ++ * used for depth instead of the main 16KB depth TLB buffer when the depth ++ * tile fits in the auxiliary buffer, allowing the hardware to allocate ++ * the 16KB from the main depth TLB to the color TLB. If we can do that, ++ * then we are effectively doubling the memory we have for color and we ++ * can also select a larger tile size. This is necessary to support ++ * the most expensive configuration: 8x128bpp RTs + MSAA. + * + * FIXME: the docs state that depth TLB memory can be used for color + * if depth testing is not used by setting the 'depth disable' bit in the +@@ -147,17 +154,40 @@ v3d_choose_tile_size(const struct v3d_device_info *devinfo, + * configuration item) or active in the subpass for which we are enabling + * the bit (which we can't tell until later, when we record commands for + * the subpass). If it is the latter, then we cannot use this feature. ++ * ++ * FIXME: pending handling double_buffer. + */ +- if (idx >= ARRAY_SIZE(tile_sizes) / 2) { +- idx--; +- } else if (idx > 0) { +- /* Depth is always 32bpp (4x32bpp for 4x MSAA) */ +- uint32_t depth_bpp = !msaa ? 4 : 16; +- uint32_t tile_w = tile_sizes[(idx - 1) * 2]; +- uint32_t tile_h = tile_sizes[(idx - 1) * 2 + 1]; +- if (tile_w * tile_h * depth_bpp <= 8192) +- idx--; +- } ++ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1); ++ const uint32_t depth_bpp = 4 * (msaa ? 
4 : 1); ++ do { ++ const uint32_t tile_w = tile_sizes[idx * 2]; ++ const uint32_t tile_h = tile_sizes[idx * 2 + 1]; ++ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp)) ++ break; ++ idx++; ++ } while (idx < ARRAY_SIZE(tile_sizes) / 2); ++ ++ /* FIXME: pending handling double_buffer */ ++ assert(!double_buffer); ++ } else { ++ /* On V3D 4.x tile size is selected based on the number of RTs, the ++ * maximum bpp across all of them and whether 4x MSAA is used. ++ */ ++ if (color_attachment_count > 4) ++ idx += 3; ++ else if (color_attachment_count > 2) ++ idx += 2; ++ else if (color_attachment_count > 1) ++ idx += 1; ++ ++ /* MSAA and double-buffer are mutually exclusive */ ++ assert(!msaa || !double_buffer); ++ if (msaa) ++ idx += 2; ++ else if (double_buffer) ++ idx += 1; ++ ++ idx += max_internal_bpp; + } + + assert(idx < ARRAY_SIZE(tile_sizes) / 2); +diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h +index 5a7e244a0a5..d02d41dd089 100644 +--- a/src/broadcom/common/v3d_util.h ++++ b/src/broadcom/common/v3d_util.h +@@ -39,9 +39,12 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + void + v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, +- uint32_t max_color_bpp, bool msaa, ++ uint32_t max_internal_bpp, ++ uint32_t total_color_bpp, ++ bool msaa, + bool double_buffer, +- uint32_t *width, uint32_t *height); ++ uint32_t *width, ++ uint32_t *height); + + uint32_t + v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index 11d161b19b7..f65388c10ec 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job, + uint32_t layers, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa, + bool double_buffer) + { +@@ -360,14 
+361,16 @@ job_compute_frame_tiling(struct v3dv_job *job, + tiling->render_target_count = render_target_count; + tiling->msaa = msaa; + tiling->internal_bpp = max_internal_bpp; ++ tiling->total_color_bpp = total_color_bpp; + tiling->double_buffer = double_buffer; + + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + + v3d_choose_tile_size(&job->device->devinfo, +- render_target_count, max_internal_bpp, +- tiling->msaa, tiling->double_buffer, ++ render_target_count, ++ max_internal_bpp, total_color_bpp, msaa, ++ tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); + + tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); +@@ -458,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa) + { + assert(job); +@@ -468,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + const struct v3dv_frame_tiling *tiling = + job_compute_frame_tiling(job, width, height, layers, + render_target_count, max_internal_bpp, +- msaa, false); ++ total_color_bpp, msaa, false); + + v3dv_cl_ensure_space_with_branch(&job->bcl, 256); + v3dv_return_if_oom(NULL, job); +@@ -529,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) + job->frame_tiling.layers, + job->frame_tiling.render_target_count, + job->frame_tiling.internal_bpp, ++ job->frame_tiling.total_color_bpp, + job->frame_tiling.msaa, + true); + +@@ -1673,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + +- uint8_t internal_bpp; ++ uint8_t max_internal_bpp, total_color_bpp; + bool msaa; + v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) +- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa); ++ (framebuffer, state->attachments, subpass, ++ &max_internal_bpp, &total_color_bpp, &msaa); + 
+ /* From the Vulkan spec: + * +@@ -1700,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + layers, + true, false, + subpass->color_count, +- internal_bpp, ++ max_internal_bpp, ++ total_color_bpp, + msaa); + } + +@@ -2668,6 +2675,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) + true, false, + old_job->frame_tiling.render_target_count, + old_job->frame_tiling.internal_bpp, ++ old_job->frame_tiling.total_color_bpp, + true /* msaa */); + + v3dv_job_destroy(old_job); +diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c +index 0a7905b49d5..1c0d66c977c 100644 +--- a/src/broadcom/vulkan/v3dv_meta_clear.c ++++ b/src/broadcom/vulkan/v3dv_meta_clear.c +@@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + + v3dv_job_start_frame(job, width, height, max_layer, + false, true, 1, internal_bpp, ++ 4 * v3d_internal_bpp_words(internal_bpp), + image->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c +index c0ec888b8c7..2d30c611e17 100644 +--- a/src/broadcom/vulkan/v3dv_meta_copy.c ++++ b/src/broadcom/vulkan/v3dv_meta_copy.c +@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->extent.width, 
block_w); + const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, +- false, true, 1, internal_bpp, ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + src->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + (fb_format, region->srcSubresource.aspectMask, + &internal_type, &internal_bpp); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, true); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ true); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c +index 7f2e2bbc710..0583faf6f9a 100644 +--- a/src/broadcom/vulkan/v3dv_pass.c ++++ b/src/broadcom/vulkan/v3dv_pass.c +@@ -325,7 +325,8 @@ subpass_get_granularity(struct v3dv_device *device, + const uint32_t color_count = subpass->color_count; + + bool msaa = false; +- uint32_t max_bpp = 0; ++ uint32_t max_internal_bpp = 0; ++ uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { + uint32_t 
attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) +@@ -339,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, + v3dv_X(device, get_internal_type_bpp_for_output_format) + (format->planes[0].rt_type, &internal_type, &internal_bpp); + +- max_bpp = MAX2(max_bpp, internal_bpp); ++ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); ++ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + + if (desc->samples > VK_SAMPLE_COUNT_1_BIT) + msaa = true; +@@ -349,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, + * heuristics so we choose a conservative granularity here, with it disabled. + */ + uint32_t width, height; +- v3d_choose_tile_size(&device->devinfo, color_count, max_bpp, msaa, ++ v3d_choose_tile_size(&device->devinfo, color_count, ++ max_internal_bpp, total_color_bpp, msaa, + false /* double-buffer */, &width, &height); + *granularity = (VkExtent2D) { + .width = width, +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index 300a1ec8ae1..9375cdd58c0 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -950,6 +950,7 @@ struct v3dv_frame_tiling { + uint32_t layers; + uint32_t render_target_count; + uint32_t internal_bpp; ++ uint32_t total_color_bpp; + bool msaa; + bool double_buffer; + uint32_t tile_width; +@@ -1373,6 +1374,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa); + + bool v3dv_job_type_is_gpu(struct v3dv_job *job); +diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index 4d17a2691a5..61ad98c1217 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -257,11 +257,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + const struct v3dv_framebuffer *framebuffer, + const struct 
v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, + bool *msaa) + { + STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); +- *max_bpp = V3D_INTERNAL_BPP_32; ++ *max_internal_bpp = V3D_INTERNAL_BPP_32; ++ *total_color_bpp = 0; + *msaa = false; + + if (subpass) { +@@ -274,8 +276,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +@@ -289,7 +294,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } +- + return; + } + +@@ -299,8 +303,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index b8f3297bc94..858096f9e4b 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -1408,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + 
framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, +@@ -1455,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index 81715520913..709b129926f 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -136,7 +136,9 @@ void + v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, bool *msaa); ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, ++ bool *msaa); + + #ifdef DEBUG + void +diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c +index f8cee36e3bf..6eed2de9d54 100644 +--- a/src/broadcom/vulkan/v3dvx_queue.c ++++ b/src/broadcom/vulkan/v3dvx_queue.c +@@ -29,7 +29,8 @@ + void + v3dX(job_emit_noop)(struct v3dv_job *job) + { +- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false); ++ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, ++ V3D_INTERNAL_BPP_32, 4, false); + v3dX(job_emit_binning_flush)(job); + + struct v3dv_cl *rcl = &job->rcl; +diff 
--git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c +index def546e9ef5..1dc4bd017fe 100644 +--- a/src/gallium/drivers/v3d/v3d_context.c ++++ b/src/gallium/drivers/v3d/v3d_context.c +@@ -233,11 +233,13 @@ v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, + assert(!is_msaa || !double_buffer); + + uint32_t max_cbuf_idx = 0; ++ uint32_t total_bpp = 0; + *max_bpp = 0; + for (int i = 0; i < nr_cbufs; i++) { + if (cbufs[i]) { + struct v3d_surface *surf = v3d_surface(cbufs[i]); + *max_bpp = MAX2(*max_bpp, surf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp); + max_cbuf_idx = MAX2(i, max_cbuf_idx); + } + } +@@ -246,9 +248,11 @@ v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, + struct v3d_surface *bsurf = v3d_surface(bbuf); + assert(bbuf->texture->nr_samples <= 1 || is_msaa); + *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp); + } + +- v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, *max_bpp, ++ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, ++ *max_bpp, total_bpp, + is_msaa, double_buffer, + tile_width, tile_height); + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch new file mode 100644 index 0000000000..c03e043b90 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch @@ -0,0 +1,241 @@ +From 210338b6b1b030d36acaebad504ed2bec4a2cd74 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Fri, 19 Nov 2021 10:51:37 +0100 +Subject: [PATCH 125/142] v3dv: implement depthBounds support for v71 + +Just for for v71, as that feature is not supported by older hw. 
+--- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 19 ++++++++++++--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + src/broadcom/vulkan/v3dv_pipeline.c | 17 ++++++++------ + src/broadcom/vulkan/v3dv_private.h | 12 +++++++++- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 32 ++++++++++++++++++++++++++ + src/broadcom/vulkan/v3dvx_pipeline.c | 3 +++ + src/broadcom/vulkan/v3dvx_private.h | 3 +++ + 7 files changed, 76 insertions(+), 12 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index f65388c10ec..36bd7960985 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -2070,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, + } + } + ++ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ if (memcmp(&dest->depth_bounds, &src->depth_bounds, ++ sizeof(src->depth_bounds))) { ++ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds)); ++ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++ } ++ } ++ + if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { + if (dest->line_width != src->line_width) { + dest->line_width = src->line_width; +@@ -2940,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) + v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + ++ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS) ++ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); ++ + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) + v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); + +@@ -3369,9 +3380,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, + float minDepthBounds, + float maxDepthBounds) + { +- /* We do not support depth bounds testing so we just ignore this. We are +- * already asserting that pipelines don't enable the feature anyway. 
+- */ ++ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); ++ ++ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; ++ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; ++ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; + } + + VKAPI_ATTR void VKAPI_CALL +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index 19e58542414..1de9b5ce683 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -227,7 +227,7 @@ get_features(const struct v3dv_physical_device *physical_device, + .depthClamp = false, /* Only available since V3D 4.5.1.1 */ + .depthBiasClamp = true, + .fillModeNonSolid = true, +- .depthBounds = false, /* Only available since V3D 4.3.16.2 */ ++ .depthBounds = physical_device->devinfo.ver >= 71, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index 3bcdcc9a853..ba782b8268a 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) + return V3DV_DYNAMIC_LINE_WIDTH; + case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: + return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; +- +- /* Depth bounds testing is not available in in V3D 4.2 so here we are just +- * ignoring this dynamic state. We are already asserting at pipeline creation +- * time that depth bounds testing is not enabled. 
+- */ + case VK_DYNAMIC_STATE_DEPTH_BOUNDS: +- return 0; ++ return V3DV_DYNAMIC_DEPTH_BOUNDS; + + default: + unreachable("Unhandled dynamic state"); +@@ -2642,6 +2637,7 @@ pipeline_init_dynamic_state( + dynamic->line_width = 1.0f; + dynamic->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; ++ dynamic->depth_bounds.max = 1.0f; + + /* Create a mask of enabled dynamic states */ + uint32_t dynamic_states = 0; +@@ -2694,6 +2690,11 @@ pipeline_init_dynamic_state( + dynamic->stencil_reference.front = pDepthStencilState->front.reference; + dynamic->stencil_reference.back = pDepthStencilState->back.reference; + } ++ ++ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; ++ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; ++ } + } + + if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { +@@ -2907,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, + /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that + * feature and it shouldn't be used by any pipeline. + */ +- assert(!ds_info || !ds_info->depthBoundsTestEnable); ++ assert(device->devinfo.ver >= 71 || ++ !ds_info || !ds_info->depthBoundsTestEnable); ++ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; + + enable_depth_bias(pipeline, rs_info); + +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index 9375cdd58c0..a074e0a981c 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -1045,7 +1045,8 @@ enum v3dv_dynamic_state_bits { + V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, + V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, + V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, +- V3DV_DYNAMIC_ALL = (1 << 9) - 1, ++ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, ++ V3DV_DYNAMIC_ALL = (1 << 10) - 1, + }; + + /* Flags for dirty pipeline state. 
+@@ -1070,6 +1071,7 @@ enum v3dv_cmd_dirty_bits { + V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, + V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, ++ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, + }; + + struct v3dv_dynamic_state { +@@ -1106,6 +1108,11 @@ struct v3dv_dynamic_state { + float slope_factor; + } depth_bias; + ++ struct { ++ float min; ++ float max; ++ } depth_bounds; ++ + float line_width; + + uint32_t color_write_enable; +@@ -2333,6 +2340,9 @@ struct v3dv_pipeline { + bool is_z16; + } depth_bias; + ++ /* Depth bounds */ ++ bool depth_bounds_test_enabled; ++ + struct { + void *mem_ctx; + struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index bf5e47018e8..9307a6e9d93 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1507,6 +1507,38 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; + } + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) ++{ ++ /* No depthBounds support for v42, so this method is empty on that case. ++ * ++ * Note that this method is being called as v3dv_job_init flag all state as ++ * dirty. See FIXME note at v3dv_job_init. 
++ */ ++ ++#if V3D_VERSION >= 71 ++ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; ++ assert(pipeline); ++ ++ if (!pipeline->depth_bounds_test_enabled) ++ return; ++ ++ struct v3dv_job *job = cmd_buffer->state.job; ++ assert(job); ++ ++ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); ++ v3dv_return_if_oom(cmd_buffer, NULL); ++ ++ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { ++ bounds.lower_test_limit = dynamic->depth_bounds.min; ++ bounds.upper_test_limit = dynamic->depth_bounds.max; ++ } ++ ++ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) + { +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 7b1133f8173..83ab2f19e4f 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -259,6 +259,9 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + } else { + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; + } ++ ++ config.depth_bounds_test_enable = ++ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; + #endif + }; + } +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index 709b129926f..1ce4789c5ac 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer); + void + v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); + +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch 
b/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch new file mode 100644 index 0000000000..e59c0e1890 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch @@ -0,0 +1,119 @@ +From be6508ffef8c0e9fbc47175739db80a3eeff2cdb Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Fri, 3 Dec 2021 13:20:22 +0100 +Subject: [PATCH 126/142] v3d,v3dv: propagate NaNs bits in shader state records + are reserved in v7.x + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 4 ++++ + src/broadcom/vulkan/v3dvx_pipeline.c | 10 +++++----- + src/gallium/drivers/v3d/v3dx_draw.c | 14 +++++++++----- + 3 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 9307a6e9d93..580aeb8ba2b 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -2175,7 +2175,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -2185,7 +2187,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 83ab2f19e4f..c9b537f4b32 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -471,19 
+471,19 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + shader.number_of_varyings_in_fragment_shader = + prog_data_fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + /* Note: see previous note about addresses */ + /* shader.coordinate_shader_code_address */ + /* shader.vertex_shader_code_address */ + /* shader.fragment_shader_code_address */ + ++#if V3D_VERSION == 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* FIXME: Use combined input/output size flag in the common case (also + * on v3d, see v3dx_draw). + */ +-#if V3D_VERSION == 42 + shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs_bin->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 04cc3bc3ae1..e4b414b0676 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -396,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -406,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +@@ -657,10 +661,6 @@ v3d_emit_gl_shader_state(struct 
v3d_context *v3d, + shader.number_of_varyings_in_fragment_shader = + v3d->prog.fs->prog_data.fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + shader.coordinate_shader_code_address = + cl_address(v3d_resource(v3d->prog.cs->resource)->bo, + v3d->prog.cs->offset); +@@ -671,10 +671,14 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + cl_address(v3d_resource(v3d->prog.fs->resource)->bo, + v3d->prog.fs->offset); + ++#if V3D_VERSION <= 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* XXX: Use combined input/output size flag in the common + * case. + */ +-#if V3D_VERSION <= 42 + shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.cs->prog_data.vs->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch b/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch new file mode 100644 index 0000000000..81357ea2f9 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch @@ -0,0 +1,296 @@ +From c74ba2b39e7b9fe6c5415c20c98cd231d2674df6 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Tue, 16 May 2023 00:38:40 +0200 +Subject: [PATCH 127/142] v3dv: use new texture shader state rb_swap and + reverse fields in v3d 7.x + +In v3d 4.x we handle formats that are reversed or R/B swapped by +applying a format swizzle. This doesn't work on border colors though, +and for that there is a specific bit to reverse the border color in +the texture shader state. 
+ +In v3d 7.x we have new reverse and swap R/B bits and we no longer have +a bit to reverse the border color because the new reverse bit applies +to border texels too. Because of this, we absolutely need to use these +new bits in order to get correct border colors in all cases with these +formats. + +When we enable the reverse and/or swap R/B bits, we are effectively +applying the format swizzle through them, so in these cases we need to +make sure the swizzle we program in the texture shader state is the +view swizzle provided by the API and not the composition of the format +swizzle with the view swizzle like we do in 4.x for all formats. The +same applies to custom border colors: we must not apply the format +swizzle to them for formats that are reversed or R/B swapped, because +again, this format swizzle is already applied through these new bits. + +While we are doing this, we also fully adopt the texture shader state +spec from v3d 7.1.5 for v3d 7.x instead of using a description from +7.1.2 which is incompatible and required the driver to manually pack +some of the bits. 
+--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + src/broadcom/vulkan/v3dv_image.c | 7 ++-- + src/broadcom/vulkan/v3dv_private.h | 13 ++++++- + src/broadcom/vulkan/v3dvx_device.c | 24 ++++++++++-- + src/broadcom/vulkan/v3dvx_image.c | 56 ++++++++++++++++++---------- + src/broadcom/vulkan/v3dvx_private.h | 3 +- + src/gallium/drivers/v3d/v3dx_state.c | 6 --- + 7 files changed, 75 insertions(+), 36 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index 1de9b5ce683..b520bfa0002 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -2989,7 +2989,7 @@ v3dv_CreateSampler(VkDevice _device, + } + } + +- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); ++ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); + + *pSampler = v3dv_sampler_to_handle(sampler); + +diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c +index ebbd60e4c03..e01e2e1bd19 100644 +--- a/src/broadcom/vulkan/v3dv_image.c ++++ b/src/broadcom/vulkan/v3dv_image.c +@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device, + * makes sense to implement swizzle composition using VkSwizzle directly. 
+ */ + VkFormat format; +- uint8_t image_view_swizzle[4]; + if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + format = VK_FORMAT_R8G8B8A8_UINT; +@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device, + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); + + util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } else { + format = pCreateInfo->format; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } + + iview->vk.view_format = format; +@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device, + + const uint8_t *format_swizzle = + v3dv_get_format_swizzle(device, format, plane); +- util_format_compose_swizzles(format_swizzle, image_view_swizzle, ++ util_format_compose_swizzles(format_swizzle, iview->view_swizzle, + iview->planes[plane].swizzle); + + iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index a074e0a981c..8adb8873efd 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -776,6 +776,8 @@ struct v3dv_image_view { + + const struct v3dv_format *format; + ++ uint8_t view_swizzle[4]; ++ + uint8_t plane_count; + struct { + uint8_t image_plane; +@@ -786,8 +788,8 @@ struct v3dv_image_view { + uint32_t internal_type; + uint32_t offset; + +- /* Precomputed (composed from createinfo->components and formar swizzle) +- * swizzles to pass in to the shader key. ++ /* Precomputed swizzle (composed from the view swizzle and the format ++ * swizzle). 
+ * + * This could be also included on the descriptor bo, but the shader state + * packet doesn't need it on a bo, so we can just avoid a memory copy +@@ -2358,6 +2360,13 @@ struct v3dv_pipeline { + uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; + }; + ++static inline bool ++v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) ++{ ++ return device->devinfo.ver > 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev >= 5); ++} ++ + static inline VkPipelineBindPoint + v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) + { +diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index 61ad98c1217..1b50d51e19f 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -50,6 +50,7 @@ vk_to_v3d_compare_func[] = { + }; + + static union pipe_color_union encode_border_color( ++ const struct v3dv_device *device, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { + const struct util_format_description *desc = +@@ -76,12 +77,28 @@ static union pipe_color_union encode_border_color( + * colors so we need to fix up the swizzle manually for this case. + */ + uint8_t swizzle[4]; +- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && ++ const bool v3d_has_reverse_swap_rb_bits = ++ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); ++ if (!v3d_has_reverse_swap_rb_bits && ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { + swizzle[0] = PIPE_SWIZZLE_W; + swizzle[1] = PIPE_SWIZZLE_X; + swizzle[2] = PIPE_SWIZZLE_Y; + swizzle[3] = PIPE_SWIZZLE_Z; ++ } ++ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead ++ * we have to use the new reverse and swap_r/b flags in the texture shader ++ * state which will apply the format swizzle automatically when sampling ++ * the border color too and we should not apply it manually here. 
++ */ ++ else if (v3d_has_reverse_swap_rb_bits && ++ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { ++ swizzle[0] = PIPE_SWIZZLE_X; ++ swizzle[1] = PIPE_SWIZZLE_Y; ++ swizzle[2] = PIPE_SWIZZLE_Z; ++ swizzle[3] = PIPE_SWIZZLE_W; + } else { + memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); + } +@@ -179,7 +196,8 @@ static union pipe_color_union encode_border_color( + } + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { +@@ -221,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, + s.border_color_mode = border_color_mode; + + if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { +- union pipe_color_union border = encode_border_color(bc_info); ++ union pipe_color_union border = encode_border_color(device, bc_info); + + s.border_color_word_0 = border.ui[0]; + s.border_color_word_1 = border.ui[1]; +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index ae6eaa88d0c..de984e81220 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -108,25 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + + tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; + +- bool is_srgb = vk_format_is_srgb(image_view->vk.format); +-#if V3D_VERSION == 42 +- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; +-#endif +- +-#if V3D_VERSION == 42 +- tex.srgb = is_srgb; +-#endif +-#if V3D_VERSION >= 71 +- tex.transfer_func = is_srgb ? 
TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; +- +- /* V3D 7.1.5 has array stride starting one bit later than previous +- * V3D versions to make room for the new RB swap bit, but we don't +- * handle that in the CLE parser. +- */ +- if (device->devinfo.rev >= 5) +- tex.array_stride_64_byte_aligned <<= 1; +-#endif +- + /* At this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to + * add the bo to the job. This also means that we need to add manually +@@ -138,7 +119,44 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + iplane); + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + ++ bool is_srgb = vk_format_is_srgb(image_view->vk.format); ++ ++ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose ++ * the reverse and/or swap_r/b swizzle from the format table with the ++ * image view swizzle. This, however, doesn't work for border colors, ++ * for that there is the reverse_standard_border_color. ++ * ++ * In v3d 7.x, however, there is no reverse_standard_border_color bit, ++ * since the reverse and swap_r/b bits also affect border colors. It is ++ * because of this that we absolutely need to use these bits with ++ * reversed and swpaped formats, since that's the only way to ensure ++ * correct border colors. In that case we don't want to program the ++ * swizzle to the composition of the format swizzle and the view ++ * swizzle like we do in v3d 4.x, since the format swizzle is applied ++ * via the reverse and swap_r/b bits. ++ */ ++#if V3D_VERSION == 42 ++ tex.srgb = is_srgb; ++ tex.reverse_standard_border_color = ++ image_view->planes[plane].channel_reverse; ++#endif + #if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? 
TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++ ++ tex.reverse = image_view->planes[plane].channel_reverse; ++ tex.r_b_swap = image_view->planes[plane].swap_rb; ++ ++ if (tex.reverse || tex.r_b_swap) { ++ tex.swizzle_r = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]); ++ tex.swizzle_g = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]); ++ tex.swizzle_b = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]); ++ tex.swizzle_a = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]); ++ } ++ + tex.chroma_offset_x = 1; + tex.chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index 1ce4789c5ac..27d6736c0e3 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -131,7 +131,8 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, + /* Used at v3dv_device */ + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); + +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 970a082aa85..8cca1a5030b 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -960,12 +960,6 @@ v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo, + /* See comment in XML field definition for rationale of the shifts */ + tex->texture_base_pointer_cb = base_offset >> 6; + tex->texture_base_pointer_cr = base_offset >> 6; +- +- /* V3D 7.1.5 has array stride start at bit 33 instead of bit 32 to +- * make room for the RB swap bit. 
+- */ +- if (devinfo->rev >= 5) +- tex->array_stride_64_byte_aligned <<= 1; + #endif + + /* Since other platform devices may produce UIF images even +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch new file mode 100644 index 0000000000..c991d19da5 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch @@ -0,0 +1,34 @@ +From ef1159ad68e4969992a61b1fcdf9103409f689ca Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 8 Feb 2023 08:41:12 +0100 +Subject: [PATCH 128/142] v3dv: fix color write mask for v3d 7.x + +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 580aeb8ba2b..6827c829934 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1627,9 +1627,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ uint32_t color_write_mask = ~dynamic->color_write_enable | ++ pipeline->blend.color_write_masks; ++#if V3D_VERSION <= 42 ++ /* Only 4 RTs */ ++ color_write_mask &= 0xffff; ++#endif ++ + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- mask.mask = (~dynamic->color_write_enable | +- pipeline->blend.color_write_masks) & 0xffff; ++ mask.mask = color_write_mask; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch new file mode 100644 index 0000000000..61b2e9a859 --- 
/dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch @@ -0,0 +1,68 @@ +From aee0180b79a6a546d1e7263d89ef868016082687 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 8 Feb 2023 09:04:02 +0100 +Subject: [PATCH 129/142] v3d,v3dv: fix depth bias for v3d 7.x + +In v3d 7.x we don't need to scale up depth bias for D16 buffers. +--- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 ++ + src/gallium/drivers/v3d/v3dx_emit.c | 3 ++- + src/gallium/drivers/v3d/v3dx_state.c | 4 +++- + 3 files changed, 7 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index 6827c829934..1bd634f5027 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -1499,8 +1499,10 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(&job->bcl, DEPTH_OFFSET, bias) { + bias.depth_offset_factor = dynamic->depth_bias.slope_factor; + bias.depth_offset_units = dynamic->depth_bias.constant_factor; ++#if V3D_VERSION <= 42 + if (pipeline->depth_bias.is_z16) + bias.depth_offset_units *= 256.0f; ++#endif + bias.limit = dynamic->depth_bias.depth_bias_clamp; + } + +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 87e75281dc9..82a45e44f82 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -558,7 +558,8 @@ v3dX(emit_state)(struct pipe_context *pctx) + + if (v3d->dirty & V3D_DIRTY_RASTERIZER && + v3d->rasterizer->base.offset_tri) { +- if (job->zsbuf && ++ if (v3d->screen->devinfo.ver <= 42 && ++ job->zsbuf && + job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) { + cl_emit_prepacked_sized(&job->bcl, + v3d->rasterizer->depth_offset_z16, +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 8cca1a5030b..a7fad572a2d 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ 
b/src/gallium/drivers/v3d/v3dx_state.c +@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + #endif + } + +- /* The HW treats polygon offset units based on a Z24 buffer, so we ++ /* V3d 4.x treats polygon offset units based on a Z24 buffer, so we + * need to scale up offset_units if we're only Z16. + */ ++#if V3D_VERSION <= 42 + v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) { + depth.depth_offset_factor = cso->offset_scale; + depth.depth_offset_units = cso->offset_units * 256.0; +@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + depth.limit = cso->offset_clamp; + #endif + } ++#endif + + return so; + } +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch b/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch new file mode 100644 index 0000000000..2d0a54aa83 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch @@ -0,0 +1,141 @@ +From 221d4079c616752b249cefb352268fce5758b578 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Thu, 9 Mar 2023 19:05:19 +0100 +Subject: [PATCH 130/142] v3d,v3dv: fix compute for V3D 7.1.6+ + +--- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 25 +++++++++++++++++++++---- + src/broadcom/vulkan/v3dv_private.h | 3 ++- + src/broadcom/vulkan/v3dv_queue.c | 2 +- + src/gallium/drivers/v3d/v3dx_draw.c | 14 +++++++++++--- + 4 files changed, 35 insertions(+), 9 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index 36bd7960985..609c7acfa8f 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -3816,6 +3816,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) + + void + v3dv_cmd_buffer_rewrite_indirect_csd_job( ++ struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts) + { +@@ 
-3835,8 +3836,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( + submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + +- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * +- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; ++ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * ++ (wg_counts[0] * wg_counts[1] * wg_counts[2]); ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + if (info->needs_wg_uniform_rewrite) { +@@ -3869,6 +3877,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t **wg_uniform_offsets_out, + uint32_t *wg_size_out) + { ++ struct v3dv_device *device = cmd_buffer->device; + struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + struct v3dv_shader_variant *cs_variant = +@@ -3927,18 +3936,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + if (wg_size_out) + *wg_size_out = wg_size; + +- submit->cfg[4] = num_batches - 1; ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + assert(pipeline->shared_data->assembly_bo); + struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; + + submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; +- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (cs_variant->prog_data.base->single_seg) + submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (cs_variant->prog_data.base->threads == 4) + submit->cfg[5] 
|= V3D_CSD_CFG5_THREADING; ++ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ ++ if (device->devinfo.ver < 71) ++ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + + if (cs_variant->prog_data.cs->shared_size > 0) { + job->csd.shared_memory = +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index 8adb8873efd..2f3ef185126 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -1818,7 +1818,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, + void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, + struct drm_v3d_submit_tfu *tfu); + +-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, ++void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, ++ struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts); + + void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, +diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c +index b4aae195180..429d14a9196 100644 +--- a/src/broadcom/vulkan/v3dv_queue.c ++++ b/src/broadcom/vulkan/v3dv_queue.c +@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, + + if (memcmp(group_counts, info->csd_job->csd.wg_count, + sizeof(info->csd_job->csd.wg_count)) != 0) { +- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); ++ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); + } + + return VK_SUCCESS; +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index e4b414b0676..4e1af41d50e 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -1473,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; + + +- /* Number of batches the dispatch will invoke (minus 1). 
*/ +- submit.cfg[4] = num_batches - 1; ++ /* Number of batches the dispatch will invoke. ++ * V3D 7.1.6 and later don't subtract 1 from the number of batches ++ */ ++ if (v3d->screen->devinfo.ver < 71 || ++ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) { ++ submit.cfg[4] = num_batches - 1; ++ } else { ++ submit.cfg[4] = num_batches; ++ } + + /* Make sure we didn't accidentally underflow. */ + assert(submit.cfg[4] != ~0); +@@ -1482,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); + submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + + v3d->prog.compute->offset); +- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; ++ if (v3d->screen->devinfo.ver < 71) ++ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (v3d->prog.compute->prog_data.base->single_seg) + submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (v3d->prog.compute->prog_data.base->threads == 4) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch new file mode 100644 index 0000000000..b4270672ec --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch @@ -0,0 +1,567 @@ +From be6c7ba62dbdb9c5babd33a518a042dd554679d7 Mon Sep 17 00:00:00 2001 +From: "Juan A. Suarez Romero" +Date: Wed, 22 Feb 2023 09:43:40 +0100 +Subject: [PATCH 131/142] broadcom: add performance counters for V3D 7.x + +Some of the counters need to be defined correctly. + +v2: Remove references to extended performance counters. The hw does + not support them. + +Signed-off-by: Juan A. 
Suarez Romero +--- + .../common/v3d_performance_counters.h | 108 ++++++++++++++++++ + src/broadcom/simulator/v3d_simulator.c | 8 +- + src/broadcom/simulator/v3dx_simulator.c | 2 +- + src/broadcom/vulkan/meson.build | 1 + + src/broadcom/vulkan/v3dv_private.h | 7 +- + src/broadcom/vulkan/v3dv_query.c | 43 +------ + src/broadcom/vulkan/v3dvx_private.h | 6 + + src/broadcom/vulkan/v3dvx_query.c | 67 +++++++++++ + src/gallium/drivers/v3d/meson.build | 2 +- + src/gallium/drivers/v3d/v3d_query.c | 20 +++- + src/gallium/drivers/v3d/v3d_query.h | 6 - + src/gallium/drivers/v3d/v3dx_context.h | 10 ++ + ...d_query_perfcnt.c => v3dx_query_perfcnt.c} | 12 +- + 13 files changed, 233 insertions(+), 59 deletions(-) + create mode 100644 src/broadcom/vulkan/v3dvx_query.c + rename src/gallium/drivers/v3d/{v3d_query_perfcnt.c => v3dx_query_perfcnt.c} (94%) + +diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h +index 08d750c2cbe..a8f0cff8784 100644 +--- a/src/broadcom/common/v3d_performance_counters.h ++++ b/src/broadcom/common/v3d_performance_counters.h +@@ -28,6 +28,110 @@ + #define V3D_PERFCNT_NAME 1 + #define V3D_PERFCNT_DESCRIPTION 2 + ++#ifndef V3D_VERSION ++# error "The V3D_VERSION macro must be defined" ++#endif ++ ++#if (V3D_VERSION >= 71) ++ ++static const char *v3d_performance_counters[][3] = { ++ {"CORE", "cycle-count", "[CORE] Cycle counter"}, ++ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, ++ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, ++ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, ++ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, ++ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, ++ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one 
tile)"}, ++ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, ++ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, ++ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, ++ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, ++ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, ++ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, ++ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, ++ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, ++ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, ++ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, ++ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, ++ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, ++ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, ++ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, ++ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, ++ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, ++ {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, ++ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, ++ {"L2T", 
"L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, ++ {"L2T", "L2T-local", "[L2T] Local mode access"}, ++ {"L2T", "L2T-writeback", "[L2T] Writeback"}, ++ {"L2T", "L2T-zero", "[L2T] Zero"}, ++ {"L2T", "L2T-merge", "[L2T] Merge"}, ++ {"L2T", "L2T-fill", "[L2T] Fill"}, ++ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, ++ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, ++ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, ++ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, ++ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, ++ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, ++ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, ++ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, ++ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, ++ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, ++ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, ++ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, ++ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, ++ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, ++ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, ++ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, ++ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, ++ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, ++ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, ++ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, ++ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, ++ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, ++ {"AXI", 
"AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, ++ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, ++ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, ++ {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, ++ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, ++ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, ++ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, ++ {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, ++ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, ++ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, ++ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, ++ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, ++ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, ++ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, ++ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, ++ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, ++ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, ++ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, ++ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, ++ {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"}, ++ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, ++ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, ++ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, ++ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, ++ {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, ++ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading 
(counts only when QPU is not stalled)"}, ++ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, ++ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, ++ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, ++ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, ++ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, ++ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, ++ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, ++ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, ++}; ++ ++#elif (V3D_VERSION >= 41) ++ + static const char *v3d_performance_counters[][3] = { + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, +@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = { + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + }; + ++#else ++static const char *v3d_performance_counters[][3] = { }; ++#endif ++ + #endif +diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c +index 5cceb1a82cc..36e719296f4 100644 +--- a/src/broadcom/simulator/v3d_simulator.c ++++ b/src/broadcom/simulator/v3d_simulator.c +@@ -92,6 +92,9 @@ static struct v3d_simulator_state { + /** Last performance monitor ID. 
*/ + uint32_t last_perfid; + ++ /** Total performance counters */ ++ uint32_t perfcnt_total; ++ + struct util_dynarray bin_oom; + int refcount; + } sim_state = { +@@ -751,7 +754,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args) + + perfmon->ncounters = args->ncounters; + for (int i = 0; i < args->ncounters; i++) { +- if (args->counters[i] >= V3D_PERFCNT_NUM) { ++ if (args->counters[i] >= sim_state.perfcnt_total) { + ralloc_free(perfmon); + return -EINVAL; + } else { +@@ -918,13 +921,16 @@ v3d_simulator_init_global() + switch(sim_state.ver) { + case 33: + v3d33_simulator_init_regs(sim_state.v3d); ++ sim_state.perfcnt_total = 0; + break; + case 41: + case 42: + v3d41_simulator_init_regs(sim_state.v3d); ++ sim_state.perfcnt_total = 87; + break; + case 71: + v3d71_simulator_init_regs(sim_state.v3d); ++ sim_state.perfcnt_total = 93; + break; + default: + unreachable("Not supported V3D version\n"); +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index 4ea177c9bb7..4520fe75719 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -50,7 +50,7 @@ + #include "libs/core/v3d/registers/7.1.5.1/v3d.h" + #else + #if V3D_VERSION == 41 || V3D_VERSION == 42 +-#include "libs/core/v3d/registers/4.1.35.0/v3d.h" ++#include "libs/core/v3d/registers/4.2.14.0/v3d.h" + #else + #include "libs/core/v3d/registers/3.3.0.0/v3d.h" + #endif +diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build +index 3da7364686f..182388a35b4 100644 +--- a/src/broadcom/vulkan/meson.build ++++ b/src/broadcom/vulkan/meson.build +@@ -65,6 +65,7 @@ files_per_version = files( + 'v3dvx_pipeline.c', + 'v3dvx_meta_common.c', + 'v3dvx_pipeline.c', ++ 'v3dvx_query.c', + 'v3dvx_queue.c', + ) + +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index 2f3ef185126..89e2f1c7e5c 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ 
b/src/broadcom/vulkan/v3dv_private.h +@@ -123,6 +123,9 @@ struct v3d_simulator_file; + /* Minimum required by the Vulkan 1.1 spec */ + #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) + ++/* Maximum performance counters number */ ++#define V3D_MAX_PERFCNT 93 ++ + struct v3dv_physical_device { + struct vk_physical_device vk; + +@@ -1210,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info { + }; + + /* Number of perfmons required to handle all supported performance counters */ +-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ ++#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ + DRM_V3D_MAX_PERF_COUNTERS) + + struct v3dv_perf_query { +@@ -1682,7 +1685,7 @@ struct v3dv_query_pool { + /* Only used with performance queries */ + struct { + uint32_t ncounters; +- uint8_t counters[V3D_PERFCNT_NUM]; ++ uint8_t counters[V3D_MAX_PERFCNT]; + + /* V3D has a limit on the number of counters we can track in a + * single performance monitor, so if too many counters are requested +diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c +index 3284c467d74..deb7821f02b 100644 +--- a/src/broadcom/vulkan/v3dv_query.c ++++ b/src/broadcom/vulkan/v3dv_query.c +@@ -23,7 +23,6 @@ + + #include "v3dv_private.h" + +-#include "common/v3d_performance_counters.h" + #include "util/timespec.h" + #include "compiler/nir/nir_builder.h" + +@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) +- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); ++ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + + assert(pq_info); +- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); + + pool->perfmon.ncounters = pq_info->counterIndexCount; + for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) 
+@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; +- uint64_t counter_values[V3D_PERFCNT_NUM]; ++ uint64_t counter_values[V3D_MAX_PERFCNT]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { +@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) + { +- uint32_t desc_count = *pCounterCount; ++ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); + +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, +- out, pCounters, pCounterCount); +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, +- out_desc, pCounterDescriptions, &desc_count); +- +- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { +- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { +- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; +- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; +- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; +- +- unsigned char sha1_result[20]; +- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], +- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), +- sha1_result); +- +- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); +- } +- +- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, +- &out_desc, desc) { +- desc->flags = 0; +- snprintf(desc->name, sizeof(desc->name), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_NAME]); +- snprintf(desc->category, sizeof(desc->category), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); +- snprintf(desc->description, sizeof(desc->description), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); +- } +- } +- +- return vk_outarray_status(&out); ++ return 
v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, ++ pCounters, ++ pCounterDescriptions); + } + + VKAPI_ATTR void VKAPI_CALL +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index 27d6736c0e3..0f5887eab93 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -324,6 +324,12 @@ v3dX(create_default_attribute_values)(struct v3dv_device *device, + void + v3dX(job_emit_noop)(struct v3dv_job *job); + ++/* Used at v3dv_query */ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions); ++ + /* Used at v3dv_descriptor_set, and other descriptor set utils */ + uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); + +diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c +new file mode 100644 +index 00000000000..e59a1e84ff6 +--- /dev/null ++++ b/src/broadcom/vulkan/v3dvx_query.c +@@ -0,0 +1,67 @@ ++/* ++ * Copyright © 2023 Raspberry Pi Ltd ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "v3dv_private.h" ++ ++#include "common/v3d_performance_counters.h" ++ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions) ++{ ++ uint32_t desc_count = *pCounterCount; ++ ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, ++ out, pCounters, pCounterCount); ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, ++ out_desc, pCounterDescriptions, &desc_count); ++ ++ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { ++ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { ++ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; ++ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; ++ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; ++ ++ unsigned char sha1_result[20]; ++ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], ++ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), ++ sha1_result); ++ ++ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); ++ } ++ ++ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, ++ &out_desc, desc) { ++ desc->flags = 0; ++ snprintf(desc->name, sizeof(desc->name), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_NAME]); ++ snprintf(desc->category, sizeof(desc->category), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); ++ snprintf(desc->description, sizeof(desc->description), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); ++ } ++ } ++ ++ return vk_outarray_status(&out); ++} +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index b2e748573b7..289473d2ca1 100644 
+--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -34,7 +34,6 @@ files_libv3d = files( + 'v3d_query.c', + 'v3d_query.h', + 'v3d_query_pipe.c', +- 'v3d_query_perfcnt.c', + 'v3d_resource.c', + 'v3d_resource.h', + 'v3d_screen.c', +@@ -47,6 +46,7 @@ files_per_version = files( + 'v3dx_emit.c', + 'v3dx_format_table.c', + 'v3dx_job.c', ++ 'v3dx_query_perfcnt.c', + 'v3dx_rcl.c', + 'v3dx_state.c', + 'v3dx_tfu.c', +diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c +index db98c89625f..83f82e44a3d 100644 +--- a/src/gallium/drivers/v3d/v3d_query.c ++++ b/src/gallium/drivers/v3d/v3d_query.c +@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_group_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_group_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen, ++ index, ++ info); + } + + int +@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen, ++ index, ++ info); + } + + static struct pipe_query * +@@ -53,9 +59,13 @@ static struct pipe_query * + v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) + { +- return v3d_create_batch_query_perfcnt(v3d_context(pctx), +- num_queries, +- query_types); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx), ++ num_queries, ++ 
query_types); + } + + static void +diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h +index 3e1426b8d86..605ed1a12f9 100644 +--- a/src/gallium/drivers/v3d/v3d_query.h ++++ b/src/gallium/drivers/v3d/v3d_query.h +@@ -42,11 +42,5 @@ struct v3d_query + }; + + struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index); +-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types); +-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info); +-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info); + + #endif /* V3D_QUERY_H */ +diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h +index e0a5cbfb2f3..c487ac3b996 100644 +--- a/src/gallium/drivers/v3d/v3dx_context.h ++++ b/src/gallium/drivers/v3d/v3dx_context.h +@@ -61,3 +61,13 @@ bool v3dX(tfu)(struct pipe_context *pctx, + unsigned int src_layer, + unsigned int dst_layer, + bool for_mipmap); ++ ++int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_group_info *info); ++int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_info *info); ++struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, ++ unsigned num_queries, ++ unsigned *query_types); +diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +similarity index 94% +rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c +rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c +index e00d84e375f..431aad14b4f 100644 +--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c ++++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +@@ -52,8 +52,8 @@ kperfmon_destroy(struct 
v3d_context *v3d, struct v3d_perfmon_state *perfmon) + } + + int +-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info) ++v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_group_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde + } + + int +-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info) ++v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = { + }; + + struct pipe_query * +-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types) ++v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries, ++ unsigned *query_types) + { + struct v3d_query_perfcnt *pquery = NULL; + struct v3d_query *query; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch b/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch new file mode 100644 index 0000000000..25d4e26ca4 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch @@ -0,0 +1,239 @@ +From f7d5b57bca07eb9ba6fb292852e3b5057c0a8b8f Mon Sep 17 00:00:00 2001 +From: "Juan A. Suarez Romero" +Date: Mon, 20 Mar 2023 16:48:51 +0100 +Subject: [PATCH 132/142] broadcom/simulator: add per-hw version calls + +Add a wrapper to allow calling the right simulator function based on the +hardware under simulation. + +Signed-off-by: Juan A. 
Suarez Romero +--- + src/broadcom/simulator/v3d_simulator.c | 86 ++++--------------------- + src/broadcom/simulator/v3d_simulator.h | 21 ++++++ + src/broadcom/simulator/v3dx_simulator.c | 9 ++- + 3 files changed, 41 insertions(+), 75 deletions(-) + +diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c +index 36e719296f4..c4bbd61abc2 100644 +--- a/src/broadcom/simulator/v3d_simulator.c ++++ b/src/broadcom/simulator/v3d_simulator.c +@@ -439,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid) + + perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid); + if (perfmon) +- v3d41_simulator_perfmon_stop(sim_state.v3d, +- perfmon->ncounters, +- perfmon->values); ++ v3d_X_simulator(perfmon_stop)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->values); + + perfmon = v3d_get_simulator_perfmon(fd, perfid); + if (perfmon) +- v3d41_simulator_perfmon_start(sim_state.v3d, +- perfmon->ncounters, +- perfmon->counters); ++ v3d_X_simulator(perfmon_start)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->counters); + + file->active_perfid = perfid; + } +@@ -492,21 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) + bin_fd = fd; + + v3d_simulator_perfmon_switch(fd, submit->perfmon_id); +- +- switch(sim_state.ver) { +- case 33: +- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- break; +- case 41: +- case 42: +- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- break; +- case 71: +- v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- break; +- default: +- unreachable("Unsupported V3D version\n"); +- } ++ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs); + + util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *, + sim_bo) { +@@ -645,22 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) + return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); + } + +-static int 
+-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) +-{ +- switch(sim_state.ver) { +- case 33: +- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); +- case 41: +- case 42: +- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); +- case 71: +- return v3d71_simulator_get_param_ioctl(sim_state.v3d, args); +- default: +- unreachable("Unsupported V3D version\n"); +- } +-} +- + static int + v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) + { +@@ -672,20 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) + v3d_simulator_copy_in_handle(file, args->bo_handles[2]); + v3d_simulator_copy_in_handle(file, args->bo_handles[3]); + +- switch(sim_state.ver) { +- case 33: +- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- break; +- case 41: +- case 42: +- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- break; +- case 71: +- ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- break; +- default: +- unreachable("Unsupported V3D version\n"); +- } ++ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); + + v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + +@@ -712,19 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) + + v3d_simulator_perfmon_switch(fd, args->perfmon_id); + +- switch(sim_state.ver) { +- case 41: +- case 42: +- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, +- file->gmp->ofs); +- break; +- case 71: +- ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args, +- file->gmp->ofs); +- break; +- default: +- ret = -1; +- } ++ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, ++ file->gmp->ofs); + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_out_handle(file, bo_handles[i]); +@@ -835,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args) + return 0; + + case DRM_IOCTL_V3D_GET_PARAM: +- return v3d_simulator_get_param_ioctl(fd, 
args); ++ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args); + + case DRM_IOCTL_GEM_CLOSE: + return v3d_simulator_gem_close_ioctl(fd, args); +@@ -918,22 +864,18 @@ v3d_simulator_init_global() + + util_dynarray_init(&sim_state.bin_oom, NULL); + ++ v3d_X_simulator(init_regs)(sim_state.v3d); ++ + switch(sim_state.ver) { +- case 33: +- v3d33_simulator_init_regs(sim_state.v3d); +- sim_state.perfcnt_total = 0; +- break; + case 41: + case 42: +- v3d41_simulator_init_regs(sim_state.v3d); + sim_state.perfcnt_total = 87; + break; + case 71: +- v3d71_simulator_init_regs(sim_state.v3d); + sim_state.perfcnt_total = 93; + break; + default: +- unreachable("Not supported V3D version\n"); ++ sim_state.perfcnt_total = 0; + } + } + +diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h +index 1472c313a03..92305634468 100644 +--- a/src/broadcom/simulator/v3d_simulator.h ++++ b/src/broadcom/simulator/v3d_simulator.h +@@ -59,4 +59,25 @@ uint32_t v3d_simulator_get_mem_free(void); + + #endif + ++/* Helper to call simulator ver specific functions */ ++#define v3d_X_simulator(thing) ({ \ ++ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\ ++ switch (sim_state.ver) { \ ++ case 33: \ ++ case 40: \ ++ v3d_X_sim_thing = &v3d33_simulator_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ v3d_X_sim_thing = &v3d41_simulator_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_sim_thing = &v3d71_simulator_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ v3d_X_sim_thing; \ ++}) ++ + #endif +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index 4520fe75719..01cf6b22663 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -218,12 +218,12 @@ v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, + return 0; + } + +-#if V3D_VERSION >= 41 + int + v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + struct 
drm_v3d_submit_csd *args, + uint32_t gmp_ofs) + { ++#if V3D_VERSION >= 41 + int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & + V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); + g_gmp_ofs = gmp_ofs; +@@ -256,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + v3d_flush_caches(v3d); + + return 0; +-} ++#else ++ return -1; + #endif ++} + + int + v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, +@@ -545,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, + #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) + #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) + #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ +- V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) ++ V3D_PCTR_0_SRC_N_SHIFT(x) + \ ++ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) + #endif + + void +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch new file mode 100644 index 0000000000..8b238d4963 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch @@ -0,0 +1,35 @@ +From 151c13365703631f88ad77ba07afbd2ba9fa172c Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 31 May 2023 09:23:51 +0200 +Subject: [PATCH 133/142] v3dv: expose fullDrawIndexUint32 in V3D 7.x + +--- + src/broadcom/vulkan/v3dv_device.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index b520bfa0002..ca5f676b6f7 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device, + *features = (struct vk_features) { + /* Vulkan 1.0 */ + .robustBufferAccess = true, /* This feature is mandatory */ +- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ ++ 
.fullDrawIndexUint32 = physical_device->devinfo.ver >= 71, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, +@@ -1451,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .subPixelPrecisionBits = V3D_COORD_SHIFT, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, +- .maxDrawIndexedIndexValue = 0x00ffffff, ++ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? ++ 0xffffffff : 0x00ffffff, + .maxDrawIndirectCount = 0x7fffffff, + .maxSamplerLodBias = 14.0f, + .maxSamplerAnisotropy = 16.0f, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch new file mode 100644 index 0000000000..6f906ff11d --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch @@ -0,0 +1,56 @@ +From aec0c613e651984e577f580aedceb3561d6a3b19 Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 31 May 2023 10:38:59 +0200 +Subject: [PATCH 134/142] v3dv: expose depthClamp in V3D 7.x + +--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + src/broadcom/vulkan/v3dvx_pipeline.c | 5 ++++- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index ca5f676b6f7..30a9894789b 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -224,7 +224,7 @@ get_features(const struct v3dv_physical_device *physical_device, + .logicOp = true, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = true, +- .depthClamp = false, /* Only available since V3D 4.5.1.1 */ ++ .depthClamp = physical_device->devinfo.ver >= 71, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = physical_device->devinfo.ver >= 71, +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 
c9b537f4b32..ad22add155d 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -243,6 +243,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + * supported in the driver yet, so in practice we are always enabling Z + * clipping for now. + */ ++ bool z_clamp_enable = rs_info && rs_info->depthClampEnable; + bool z_clip_enable = false; + const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = + ds_info ? vk_find_struct_const(ds_info->pNext, +@@ -250,7 +251,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + NULL; + if (clip_info) + z_clip_enable = clip_info->depthClipEnable; +- else if (!(rs_info && rs_info->depthClampEnable)) ++ else if (!z_clamp_enable) + z_clip_enable = true; + + if (z_clip_enable) { +@@ -260,6 +261,8 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; + } + ++ config.z_clamp_mode = z_clamp_enable; ++ + config.depth_bounds_test_enable = + ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; + #endif +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch b/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch new file mode 100644 index 0000000000..831de83810 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch @@ -0,0 +1,29 @@ +From 6bd92fecf57b5b1ae3f1f665726c4a0c43d3d90e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Tue, 11 Apr 2023 13:11:39 +0200 +Subject: [PATCH 135/142] v3dv/temporary: disable EXT_acquire_drm_display + +So we could made a conformance run, without the need to include the +CTS patch for this issue: + +https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4377 +--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c 
b/src/broadcom/vulkan/v3dv_device.c +index 30a9894789b..c0ffc05750f 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = { + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, +- .EXT_acquire_drm_display = true, ++ .EXT_acquire_drm_display = false, + #endif + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch new file mode 100644 index 0000000000..402eb77074 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch @@ -0,0 +1,27 @@ +From 7960516490008ab42ab31e921369b1ffb8f67bde Mon Sep 17 00:00:00 2001 +From: Iago Toral Quiroga +Date: Wed, 21 Jun 2023 10:29:07 +0200 +Subject: [PATCH 136/142] v3dv: expose scalarBlockLayout on V3D 7.x + +This version of V3D doesn't have the restriction that vector accesses +must not cross 16-byte boundaries. +--- + src/broadcom/vulkan/v3dv_device.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index c0ffc05750f..8f8102ae46e 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device, + * problematic, we would always have to scalarize. Overall, this would + * not lead to best performance so let's just not support it. + */ +- .scalarBlockLayout = false, ++ .scalarBlockLayout = physical_device->devinfo.ver >= 71, + /* This tells applications 2 things: + * + * 1. If they can select just one aspect for barriers. 
For us barriers +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch b/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch new file mode 100644 index 0000000000..5ff628c96d --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch @@ -0,0 +1,42 @@ +From b58e1d7fd1c315e6ada0ad9ec4961b65c88f0c2a Mon Sep 17 00:00:00 2001 +From: Jose Maria Casanova Crespo +Date: Mon, 4 Oct 2021 14:30:30 +0200 +Subject: [PATCH 137/142] dri: Limit the max_num_back to 2 on + COMPLETE_MODE_FLIP present mode + +This is limiting the number of back buffers that mesa can allocate, so +this avoids triple buffering, although that is desirable in some cases. + +To get this to upstream, we could convert it to a DRI option +and enable it only in the case of using mutter. +It seems to be feasible to limit this to some kind of configuration, as +we have access to the size of the back-buffer allocated. For example, +only limit for 4k-dual screen setup. + +With this Raspberry OS start-up CMA usage is 210Mb with 4k-dual screen +setup instead of 276Mb. + +The correct approach would be to check if we can make Mutter to wait +for buffer swaps before starting a new frame. 
+ +https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7033 +--- + src/loader/loader_dri3_helper.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c +index 32135770e9d..2534c817dcc 100644 +--- a/src/loader/loader_dri3_helper.c ++++ b/src/loader/loader_dri3_helper.c +@@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw) + if (draw->swap_interval == 0) + draw->max_num_back = 4; + else +- draw->max_num_back = 3; ++ draw->max_num_back = 2; + + assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK); + break; +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch b/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch new file mode 100644 index 0000000000..d1504ba496 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch @@ -0,0 +1,369 @@ +From d0f2a99045fa9835fea822ada58a344e2fdc1b13 Mon Sep 17 00:00:00 2001 +From: Jose Maria Casanova Crespo +Date: Thu, 21 Oct 2021 22:04:57 +0200 +Subject: [PATCH 138/142] v3d: Ignore SCANOUT usage flags when not needed under + X + +These downstream patches force the usage of tiled formats +when possible, they have been tested for the Rasbperry Pi OS +desktop enviroment using Mutter+Xserver. + +It includes the following 3 patches: + - v3d: Add driconf options to rewrite SCANOUT usages + - v3d: Check if are under X session + - v3d: enable options to ignore SCANOUT flag on resource creation + +v3d: Add driconf options to rewrite SCANOUT usages + +We create a new eviroment variable V3D_IGNORE_SCANOUT_USAGES +that will affect v3d_resource_create_with_modifiers so +SCANOUT usages can be ignored. 
It can be enabled under X11 +with a compositor so applications are forced to use tiled render +buffers instead of the default behaviour that uses SCANOUT and +consumes the limited CMA memory in the RPi4. + +The two new driconf options modulate the effect on two applications +Xorg and mutter. + +"v3d_maintain_ignorable_scanout": is enabled in mutter, could be used +in other compositors, the objective is that, when the environment enables +V3D_IGNORE_SCANOUT_USAGES, SCANOUT usages aren't ignored in the compositor. + +"v3d_is_xserver_process": is used to handle a particular case +to avoid checking if an Xserver connection is available using XCB +as in some cases the call stalls the Xserver on boot. + +Following patches will use these configuration options to ignore or not +the SCANOUT usage on v3d_resource_allocation with modifiers. + +Upstreaming this patch needs a review of the effects of: + ad50b47a14e9 ("gbm: assume USE_SCANOUT in create_with_modifiers") + +v2: driconf for v3d_is_xserver_process is needed under XWayland + to avoid XCB connections in the XWayland process. + +v3d: Check if are under X session + +If we are using Wayland + XWayland, this is considered *not* being under +X session. + +v3d: enable options to ignore SCANOUT flag on resource creation + +This is a downstream patch for enabling the usage of more tiled +buffers in Raspberry OS under an environment using mutter and Xorg. + +This patch enables the following behaviour in order to reduce the +amount of CMA usage and use tiled layouts because we ignore +the possible SCANOUT usage of the resource. + +This patch makes mutter not ignore SCANOUT flags because, as the +compositor, it should allocate linear render buffers suitable for display.
+ +Then if the Xserver has enabled the dmabuf_capable option, the +buffers backing the window pixmaps will be allocated using modifiers; +in the patched downstream Xserver, making pixmaps exportable will use +gbm_bo_create_with_modifiers2, which does not add the SCANOUT flag +for exporting pixmaps. With the Mutter compositor we didn't find a +situation where these pixmaps needed to be SCANOUT. This is not certain, +but it allows us to not use CMA for every window opened, and having them +in tiled format saves all linear->tiled conversion for sampling. + +Finally, to take advantage of using tiled render buffers for applications +we can enable V3D_IGNORE_SCANOUT_USAGES in the environment so all render +targets use the tiled UIF format without CMA memory instead of a linear one. +As the compositor mutter will composite the final surface for display we +aren't going to use the SCANOUT flag. This only applies if we are under +an X11 session. + +v2: v3d: ignore V3D_IGNORE_SCANOUT if only LINEAR modifier available + This is a fixup for the behaviour of ignoring SCANOUT flags + so we don't allocate CMA memory on V3D for render targets under + X11 as UIF isn't included and only LINEAR is a valid modifier + when Xserver is using msdri3. So we cannot ignore the SCANOUT flag. + As the Xserver in this situation is limiting the available modifiers + to linear, we can identify this case by not ignoring the SCANOUT + flag when we can only allocate linear resources.
+--- + src/gallium/drivers/v3d/driinfo_v3d.h | 2 + + src/gallium/drivers/v3d/meson.build | 17 +++++--- + src/gallium/drivers/v3d/v3d_resource.c | 31 ++++++++++++-- + src/gallium/drivers/v3d/v3d_screen.c | 59 ++++++++++++++++++++++++++ + src/gallium/drivers/v3d/v3d_screen.h | 6 +++ + src/util/00-mesa-defaults.conf | 3 ++ + src/util/driconf.h | 8 ++++ + 7 files changed, 117 insertions(+), 9 deletions(-) + +diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h +index 147ad0b49bd..8f989e8aa57 100644 +--- a/src/gallium/drivers/v3d/driinfo_v3d.h ++++ b/src/gallium/drivers/v3d/driinfo_v3d.h +@@ -2,4 +2,6 @@ + + DRI_CONF_SECTION_MISCELLANEOUS + DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false) ++ DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false) ++ DRI_CONF_V3D_IS_XSERVER_PROCESS(false) + DRI_CONF_SECTION_END +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index 289473d2ca1..e47682db1aa 100644 +--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -61,6 +61,16 @@ endif + + v3d_versions = ['33', '42', '71'] + ++v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers] ++ ++if with_platform_x11 ++ v3d_deps += dep_xcb ++endif ++ ++if with_platform_wayland ++ v3d_deps += dep_wayland_client ++endif ++ + per_version_libs = [] + foreach ver : v3d_versions + per_version_libs += static_library( +@@ -72,7 +82,7 @@ foreach ver : v3d_versions + ], + c_args : [v3d_args, '-DV3D_VERSION=' + ver], + gnu_symbol_visibility : 'hidden', +- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers], ++ dependencies : v3d_deps, + ) + + endforeach +@@ -95,10 +105,7 @@ libv3d = static_library( + c_args : [v3d_args], + cpp_args : [v3d_args], + gnu_symbol_visibility : 'hidden', +- dependencies : [ +- dep_v3dv3, dep_libdrm, dep_valgrind, +- idep_nir_headers, idep_mesautil, +- ], ++ dependencies : v3d_deps + idep_mesautil, + link_with: [per_version_libs], + ) + +diff --git 
a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c +index a0a210ccad5..46de1b16ae0 100644 +--- a/src/gallium/drivers/v3d/v3d_resource.c ++++ b/src/gallium/drivers/v3d/v3d_resource.c +@@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen, + case WINSYS_HANDLE_TYPE_SHARED: + return v3d_bo_flink(bo, &whandle->handle); + case WINSYS_HANDLE_TYPE_KMS: +- if (screen->ro) { ++ if (screen->ro && rsc->scanout) { + if (renderonly_get_handle(rsc->scanout, whandle)) { + whandle->stride = rsc->slices[0].stride; + return true; +@@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen, + return rsc; + } + ++static bool ++v3d_resource_should_scanout(struct pipe_screen *pscreen, ++ const struct pipe_resource *tmpl, ++ const uint64_t *modifiers, ++ int count) ++{ ++ struct v3d_screen *screen = v3d_screen(pscreen); ++ ++ if (tmpl->bind & PIPE_BIND_SCANOUT) { ++ if (screen->maintain_ignorable_scanout) ++ return true; ++ if (screen->has_x_session && screen->ignore_scanout_usages) { ++ if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF, ++ modifiers, count)) ++ return false; ++ } ++ return true; ++ } ++ return false; ++} ++ + static struct pipe_resource * + v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, + const struct pipe_resource *tmpl, +@@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, + struct pipe_resource *prsc = &rsc->base; + /* Use a tiled layout if we can, for better 3D performance. */ + bool should_tile = true; ++ bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl, ++ modifiers, count); + + assert(tmpl->target != PIPE_BUFFER || + (tmpl->format == PIPE_FORMAT_NONE || +@@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, + /* If using the old-school SCANOUT flag, we don't know what the screen + * might support other than linear. Just force linear. 
+ */ +- if (tmpl->bind & PIPE_BIND_SCANOUT) ++ if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout) + should_tile = false; + + /* No user-specified modifier; determine our own. */ +@@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, + + v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED); + +- if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) { ++ if (screen->ro && should_scanout) { + struct winsys_handle handle; + struct pipe_resource scanout_tmpl = { + .target = prsc->target, +@@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen, + } + } + +- if (screen->ro) { ++ if (screen->ro && !rsc->tiled) { + /* Make sure that renderonly has a handle to our buffer in the + * display's fd, so that a later renderonly_get_handle() + * returns correct handles or GEM names. +diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c +index 2225edf85bd..1d4f619d710 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.c ++++ b/src/gallium/drivers/v3d/v3d_screen.c +@@ -47,6 +47,42 @@ + #include "compiler/v3d_compiler.h" + #include "drm-uapi/drm_fourcc.h" + ++#ifdef HAVE_WAYLAND_PLATFORM ++#include ++#endif ++ ++#ifdef HAVE_X11_PLATFORM ++#include ++#endif ++ ++static bool ++check_x_session() ++{ ++ bool xcb_connection = false; ++ ++#ifdef HAVE_WAYLAND_PLATFORM ++ struct wl_display *display; ++ ++ display = wl_display_connect(NULL); ++ ++ if (display) { ++ wl_display_disconnect(display); ++ return xcb_connection; ++ } ++#endif ++ ++#ifdef HAVE_X11_PLATFORM ++ xcb_connection_t *conn; ++ ++ conn = xcb_connect(NULL, NULL); ++ ++ if (!xcb_connection_has_error(conn)) ++ xcb_connection = true; ++ xcb_disconnect(conn); ++#endif ++ return xcb_connection; ++} ++ + static const char * + v3d_screen_get_name(struct pipe_screen *pscreen) + { +@@ -945,6 +981,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, + v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH); + 
screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON); + ++ screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES"); ++ ++ const char *is_xserver_process = ++ "v3d_is_xserver_process"; ++ screen->is_xserver_process = ++ driCheckOption(config->options, ++ is_xserver_process, ++ DRI_BOOL) && ++ driQueryOptionb(config->options, ++ is_xserver_process); ++ ++ const char *maintain_ignorable_scanout_name = ++ "v3d_maintain_ignorable_scanout"; ++ screen->maintain_ignorable_scanout = ++ driCheckOption(config->options, ++ maintain_ignorable_scanout_name, ++ DRI_BOOL) && ++ driQueryOptionb(config->options, ++ maintain_ignorable_scanout_name); ++ ++ screen->has_x_session = !screen->is_xserver_process && ++ check_x_session(); ++ + v3d_fence_init(screen); + + v3d_process_debug_variable(); +diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h +index 1da9b83c965..c0f22707075 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.h ++++ b/src/gallium/drivers/v3d/v3d_screen.h +@@ -83,6 +83,12 @@ struct v3d_screen { + bool has_cache_flush; + bool has_perfmon; + bool nonmsaa_texture_size_limit; ++ bool ignore_scanout_usages; ++ bool is_xserver_process; ++ bool maintain_ignorable_scanout; ++ ++ /* Are we running in an X session? */ ++ bool has_x_session; + + struct v3d_simulator_file *sim_file; + +diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf +index 948c1ef78ba..2de7505521c 100644 +--- a/src/util/00-mesa-defaults.conf ++++ b/src/util/00-mesa-defaults.conf +@@ -77,6 +77,7 @@ TODO: document the other workarounds. + + + + + +@@ -767,6 +768,7 @@ TODO: document the other workarounds. 
+ + + + + + + + +diff --git a/src/util/driconf.h b/src/util/driconf.h +index 042ee27d9a3..56511f6615e 100644 +--- a/src/util/driconf.h ++++ b/src/util/driconf.h +@@ -521,6 +521,14 @@ + DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \ + "Report the non-MSAA-only texture size limit") + ++#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \ ++ DRI_CONF_OPT_B(v3d_is_xserver_process, def, \ ++ "Identifies if the application is the Xserver.") ++ ++#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def) \ ++ DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \ ++ "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.") ++ + /** + * \brief virgl specific configuration options + */ +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch b/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch new file mode 100644 index 0000000000..a453a83892 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch @@ -0,0 +1,117 @@ +From fc1fe85f01a67ef6e5758f1022950ad79b1b305a Mon Sep 17 00:00:00 2001 +From: Neil Roberts +Date: Mon, 5 Jul 2021 20:19:06 +0200 +Subject: [PATCH 139/142] Add a hack to avoid the shadow tex update for + imported linear texs + +This adds a hacky interface so that an application can override the +mechanism used to detect when to update the shadow texture which is used +when importing a linear texture. 
The application can enable this by +calling: + +glTexParameteri(GL_TEXTURE_2D, GL_SYNC_CONDITION, 1); + +And then whenever it determines that the shadow texture should be +updated it can call: + +glTexParameteri(GL_TEXTURE_2D, GL_SYNC_STATUS, 1); + +(cherry picked from commit 1269e2cfbfa876fdc85037b9435085174d76ad57) +--- + src/gallium/drivers/v3d/v3d_resource.c | 5 ++++- + src/gallium/include/pipe/p_state.h | 4 ++++ + src/mesa/main/mtypes.h | 3 +++ + src/mesa/main/texparam.c | 18 ++++++++++++++++++ + 4 files changed, 29 insertions(+), 1 deletion(-) + +diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c +index 46de1b16ae0..8e31acb0ff0 100644 +--- a/src/gallium/drivers/v3d/v3d_resource.c ++++ b/src/gallium/drivers/v3d/v3d_resource.c +@@ -1048,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx, + + assert(view->texture != pview->texture); + +- if (shadow->writes == orig->writes && orig->bo->private) ++ if (shadow->writes == orig->writes && ++ orig->base.sync_status == 0 && ++ (orig->bo->private || orig->base.sync_condition)) + return; + + perf_debug("Updating %dx%d@%d shadow for linear texture\n", +@@ -1091,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx, + } + + shadow->writes = orig->writes; ++ orig->base.sync_status = 0; + } + + static struct pipe_surface * +diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h +index 549e4d21c05..abc58552544 100644 +--- a/src/gallium/include/pipe/p_state.h ++++ b/src/gallium/include/pipe/p_state.h +@@ -610,6 +610,10 @@ struct pipe_resource + unsigned bind; /**< bitmask of PIPE_BIND_x */ + unsigned flags; /**< bitmask of PIPE_RESOURCE_FLAG_x */ + ++ /* Hack for avoiding sync on v3d */ ++ unsigned sync_condition; ++ unsigned sync_status; ++ + /** + * For planar images, ie. YUV EGLImage external, etc, pointer to the + * next plane. 
+diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h +index 77c38bf48d5..1eb2dac8018 100644 +--- a/src/mesa/main/mtypes.h ++++ b/src/mesa/main/mtypes.h +@@ -1058,6 +1058,9 @@ struct gl_texture_object + * the pipe_resource *pt above. + */ + bool needs_validation; ++ ++ /* Hack for avoiding sync on v3d */ ++ GLboolean SyncCondition; + }; + + +diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c +index 001cc185722..139db3ce3e2 100644 +--- a/src/mesa/main/texparam.c ++++ b/src/mesa/main/texparam.c +@@ -274,6 +274,13 @@ set_tex_parameteri(struct gl_context *ctx, + } + + switch (pname) { ++ case GL_SYNC_CONDITION: ++ if (!!texObj->SyncCondition == !!params[0]) ++ return GL_FALSE; ++ texObj->SyncCondition = !!params[0]; ++ return GL_TRUE; ++ case GL_SYNC_STATUS: ++ return GL_TRUE; + case GL_TEXTURE_MIN_FILTER: + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) + goto invalid_dsa; +@@ -931,6 +938,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx, + { + if (texparam_invalidates_sampler_views(pname)) + st_texture_release_all_sampler_views(st_context(ctx), texObj); ++ ++ switch (pname) { ++ case GL_SYNC_CONDITION: ++ texObj->pt->sync_condition = texObj->SyncCondition; ++ break; ++ case GL_SYNC_STATUS: ++ texObj->pt->sync_status = 1; ++ break; ++ default: ++ ; /* nothing */ ++ } + } + + void +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch b/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch new file mode 100644 index 0000000000..1336841a6a --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch @@ -0,0 +1,29 @@ +From 270deb428f1de371492a5e6185fe410c4329eab4 Mon Sep 17 00:00:00 2001 +From: Jose Maria Casanova Crespo +Date: Mon, 25 Sep 2023 21:16:59 +0200 +Subject: [PATCH 140/142] vc4: Fix mask RGBA validation at YUV blit + +Solves regression on video players 
using GPU for +video decoding that just displays the video in green. + +Fixes: d13da7782cd80 ("vc4: call blit paths in chain") +--- + src/gallium/drivers/vc4/vc4_blit.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c +index 2cf65b5f585..87b2369b7ad 100644 +--- a/src/gallium/drivers/vc4/vc4_blit.c ++++ b/src/gallium/drivers/vc4/vc4_blit.c +@@ -347,7 +347,7 @@ vc4_yuv_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + struct vc4_resource *dst = vc4_resource(info->dst.resource); + bool ok; + +- if (info->mask & PIPE_MASK_RGBA) ++ if (!(info->mask & PIPE_MASK_RGBA)) + return; + + if (src->tiled) +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch b/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch new file mode 100644 index 0000000000..e969ec933b --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch @@ -0,0 +1,175 @@ +From f843fbceb381f8c82074e8b68583fbfe57c48a6e Mon Sep 17 00:00:00 2001 +From: Jose Maria Casanova Crespo +Date: Thu, 8 Jun 2023 00:57:15 +0200 +Subject: [PATCH 141/142] vc4: mark buffers as initialized at + vc4_texture_subdata + +This fixes several tests when the initially uploaded buffer +from CPU was being ignored because vc4_texture_subdata was not +marking the resource as written/initialized. + +The usage flags management available at vc4_resource_transfer_map +is generalized into vc4_map_usage_prep and reused at +vc4_resource_transfer_map. This makes vc4 implementation more similar +to v3d. + +This fixes 7 text in the following subgroups: + -dEQP-GLES2.functional.fbo.render.texsubimage.* + -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.* + -spec@arb_clear_texture@arb_clear_texture-* + +Cc: mesa-stable +Reviewed-by: Juan A. 
Suarez +Reviewed-by: Emma Anholt +Part-of: +--- + src/broadcom/ci/broadcom-rpi3-fails.txt | 11 ---- + src/gallium/drivers/vc4/vc4_resource.c | 71 +++++++++++++++---------- + 2 files changed, 44 insertions(+), 38 deletions(-) + +diff --git a/src/broadcom/ci/broadcom-rpi3-fails.txt b/src/broadcom/ci/broadcom-rpi3-fails.txt +index 5522310d91a..e49e77b1436 100644 +--- a/src/broadcom/ci/broadcom-rpi3-fails.txt ++++ b/src/broadcom/ci/broadcom-rpi3-fails.txt +@@ -18,11 +18,6 @@ dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail + + dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail + +-# A glTexImage, glDraw, glTexSubImage sequence into a texture is missing what looks like the drawing. +-dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail +-# A glTexImage, glDraw, glTexSubImage, glDraw sequence into a texture is missing what looks like the first drawing. +-dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail +- + # Sampling grid slightly off in test 2? + dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail + dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail +@@ -38,12 +33,6 @@ dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fa + dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail + dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail + +-# Sequence of glTexImage, glDraw, glCopyTexSubImage. +-# background red/green checkerboard on the left side is incorrectly white. +-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail +-# Maybe it was copied as RGB instead of RGBA? +-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail +- + # One of the pixels on the left edge near the bottom is wrong for both min and + # mag. 
Also a line of pixels through the image in minification. + dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail +diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c +index ad2791aa972..0a3a435a46c 100644 +--- a/src/gallium/drivers/vc4/vc4_resource.c ++++ b/src/gallium/drivers/vc4/vc4_resource.c +@@ -95,34 +95,13 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx, + slab_free(&vc4->transfer_pool, ptrans); + } + +-static void * +-vc4_resource_transfer_map(struct pipe_context *pctx, +- struct pipe_resource *prsc, +- unsigned level, unsigned usage, +- const struct pipe_box *box, +- struct pipe_transfer **pptrans) ++static void ++vc4_map_usage_prep(struct pipe_context *pctx, ++ struct pipe_resource *prsc, ++ unsigned usage) + { + struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_resource *rsc = vc4_resource(prsc); +- struct vc4_transfer *trans; +- struct pipe_transfer *ptrans; +- enum pipe_format format = prsc->format; +- char *buf; +- +- /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is +- * being mapped. +- */ +- if ((usage & PIPE_MAP_DISCARD_RANGE) && +- !(usage & PIPE_MAP_UNSYNCHRONIZED) && +- !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && +- prsc->last_level == 0 && +- prsc->width0 == box->width && +- prsc->height0 == box->height && +- prsc->depth0 == box->depth && +- prsc->array_size == 1 && +- rsc->bo->private) { +- usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; +- } + + if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { + if (vc4_resource_bo_alloc(rsc)) { +@@ -131,6 +110,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; ++ if (prsc->bind & PIPE_BIND_CONSTANT_BUFFER) ++ vc4->dirty |= VC4_DIRTY_CONSTBUF; + } else { + /* If we failed to reallocate, flush users so that we + * don't violate any syncing requirements. 
+@@ -139,7 +120,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, + } + } else if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { + /* If we're writing and the buffer is being used by the CL, we +- * have to flush the CL first. If we're only reading, we need ++ * have to flush the CL first. If we're only reading, we need + * to flush if the CL has written our buffer. + */ + if (usage & PIPE_MAP_WRITE) +@@ -152,6 +133,38 @@ vc4_resource_transfer_map(struct pipe_context *pctx, + rsc->writes++; + rsc->initialized_buffers = ~0; + } ++} ++ ++static void * ++vc4_resource_transfer_map(struct pipe_context *pctx, ++ struct pipe_resource *prsc, ++ unsigned level, unsigned usage, ++ const struct pipe_box *box, ++ struct pipe_transfer **pptrans) ++{ ++ struct vc4_context *vc4 = vc4_context(pctx); ++ struct vc4_resource *rsc = vc4_resource(prsc); ++ struct vc4_transfer *trans; ++ struct pipe_transfer *ptrans; ++ enum pipe_format format = prsc->format; ++ char *buf; ++ ++ /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is ++ * being mapped. ++ */ ++ if ((usage & PIPE_MAP_DISCARD_RANGE) && ++ !(usage & PIPE_MAP_UNSYNCHRONIZED) && ++ !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && ++ prsc->last_level == 0 && ++ prsc->width0 == box->width && ++ prsc->height0 == box->height && ++ prsc->depth0 == box->depth && ++ prsc->array_size == 1 && ++ rsc->bo->private) { ++ usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; ++ } ++ ++ vc4_map_usage_prep(pctx, prsc, usage); + + trans = slab_zalloc(&vc4->transfer_pool); + if (!trans) +@@ -240,8 +253,12 @@ vc4_texture_subdata(struct pipe_context *pctx, + } + + /* Otherwise, map and store the texture data directly into the tiled +- * texture. ++ * texture. Note that gallium's texture_subdata may be called with ++ * obvious usage flags missing! 
+ */ ++ vc4_map_usage_prep(pctx, prsc, usage | (PIPE_MAP_WRITE | ++ PIPE_MAP_DISCARD_RANGE)); ++ + void *buf; + if (usage & PIPE_MAP_UNSYNCHRONIZED) + buf = vc4_bo_map_unsynchronized(rsc->bo); +-- +2.39.2 + diff --git a/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch b/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch new file mode 100644 index 0000000000..4055fc4658 --- /dev/null +++ b/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch @@ -0,0 +1,43 @@ +From 3322c102282cf726ae575b122358060abd5b24db Mon Sep 17 00:00:00 2001 +From: Dave Stevenson +Date: Thu, 5 Oct 2023 19:32:10 +0100 +Subject: [PATCH 142/142] gallium: Add kmsro drivers for RP1 DSI, DPI, and VEC + devices + +Signed-off-by: Dave Stevenson +--- + src/gallium/targets/dri/meson.build | 3 +++ + src/gallium/targets/dri/target.c | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build +index fbec1da957b..59daf3b6fb6 100644 +--- a/src/gallium/targets/dri/meson.build ++++ b/src/gallium/targets/dri/meson.build +@@ -68,6 +68,9 @@ libgallium_dri = shared_library( + + foreach d : [[with_gallium_kmsro, [ + 'armada-drm_dri.so', ++ 'drm-rp1-dpi_dri.so', ++ 'drm-rp1-dsi_dri.so', ++ 'drm-rp1-vec_dri.so', + 'exynos_dri.so', + 'hx8357d_dri.so', + 'ili9225_dri.so', +diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c +index d506869cbb4..ecb25edd03b 100644 +--- a/src/gallium/targets/dri/target.c ++++ b/src/gallium/targets/dri/target.c +@@ -98,6 +98,9 @@ DEFINE_LOADER_DRM_ENTRYPOINT(tegra); + + #if defined(GALLIUM_KMSRO) + DEFINE_LOADER_DRM_ENTRYPOINT(armada_drm) ++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_dpi) ++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_dsi) ++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_vec) + DEFINE_LOADER_DRM_ENTRYPOINT(exynos) + 
DEFINE_LOADER_DRM_ENTRYPOINT(hx8357d) + DEFINE_LOADER_DRM_ENTRYPOINT(ili9225) +-- +2.39.2 + diff --git a/projects/RPi/options b/projects/RPi/options index 6088a1d474..d12ddc4b2f 100644 --- a/projects/RPi/options +++ b/projects/RPi/options @@ -77,6 +77,9 @@ # default: default mainline kernel LINUX="raspberrypi" + # use framebuffer console + EXTRA_CMDLINE="console=tty0" + ################################################################################ # setup build defaults ################################################################################ diff --git a/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf b/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf index e7f96e9f50..cd28869656 100644 --- a/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf +++ b/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 6.1.0-rc6 Kernel Configuration +# Linux/arm 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="armv7ve-libreelec-linux-gnueabihf-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -824,6 +824,7 @@ CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1715,7 +1716,7 @@ CONFIG_SCSI_PROC_FS=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -3638,9 +3639,7 @@ CONFIG_MEDIA_ATTACH=y # CONFIG_VIDEO_IR_I2C=y -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -3707,7 +3706,6 @@ CONFIG_VIDEO_OV7640=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens 
drivers @@ -6400,7 +6398,7 @@ CONFIG_CIFS_DEBUG=y CONFIG_CIFS_FSCACHE=y # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git a/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf b/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf index 3357ef60ea..dbbca81906 100644 --- a/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf +++ b/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm64 6.1.0-rc6 Kernel Configuration +# Linux/arm64 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -330,6 +330,7 @@ CONFIG_ARCH_ROCKCHIP=y # # ARM errata workarounds via the alternatives framework # +# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y CONFIG_ARM64_ERRATUM_826319=y CONFIG_ARM64_ERRATUM_827319=y @@ -356,6 +357,7 @@ CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y # CONFIG_ARM64_ERRATUM_2054223 is not set # CONFIG_ARM64_ERRATUM_2067961 is not set # CONFIG_ARM64_ERRATUM_2441009 is not set +# CONFIG_ARM64_ERRATUM_2966298 is not set # CONFIG_CAVIUM_ERRATUM_22375 is not set # CONFIG_CAVIUM_ERRATUM_23154 is not set # CONFIG_CAVIUM_ERRATUM_27456 is not set @@ -905,6 +907,7 @@ CONFIG_SECRETMEM=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1756,7 +1759,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -3534,9 +3537,7 @@ CONFIG_MEDIA_ATTACH=y # CONFIG_VIDEO_IR_I2C=y -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # 
CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -3603,7 +3604,6 @@ CONFIG_VIDEO_OV7640=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -6228,7 +6228,7 @@ CONFIG_CIFS_DEBUG=y CONFIG_CIFS_FSCACHE=y # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git a/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf b/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf index 1f3b91f485..4b0fcb72a7 100644 --- a/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf +++ b/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm64 6.1.0-rc6 Kernel Configuration +# Linux/arm64 6.1.57 Kernel Configuration # CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0" CONFIG_CC_IS_GCC=y @@ -329,6 +329,7 @@ CONFIG_ARCH_ROCKCHIP=y # # ARM errata workarounds via the alternatives framework # +# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y CONFIG_ARM64_ERRATUM_826319=y CONFIG_ARM64_ERRATUM_827319=y @@ -356,6 +357,7 @@ CONFIG_ARM64_ERRATUM_1319367=y # CONFIG_ARM64_ERRATUM_2054223 is not set # CONFIG_ARM64_ERRATUM_2067961 is not set # CONFIG_ARM64_ERRATUM_2441009 is not set +# CONFIG_ARM64_ERRATUM_2966298 is not set # CONFIG_CAVIUM_ERRATUM_22375 is not set # CONFIG_CAVIUM_ERRATUM_23154 is not set # CONFIG_CAVIUM_ERRATUM_27456 is not set @@ -906,6 +908,7 @@ CONFIG_SECRETMEM=y # CONFIG_ANON_VMA_NAME is not set # CONFIG_USERFAULTFD is not set # CONFIG_LRU_GEN is not set +CONFIG_LOCK_MM_AND_FIND_VMA=y # # Data Access Monitoring @@ -1862,7 +1865,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is 
not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set @@ -4070,9 +4073,7 @@ CONFIG_MEDIA_ATTACH=y # CONFIG_VIDEO_IR_I2C=y -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y # CONFIG_VIDEO_AR0521 is not set # CONFIG_VIDEO_HI556 is not set # CONFIG_VIDEO_HI846 is not set @@ -4139,7 +4140,6 @@ CONFIG_VIDEO_OV7640=m # CONFIG_VIDEO_CCS is not set # CONFIG_VIDEO_ET8EK8 is not set # CONFIG_VIDEO_M5MOLS is not set -# end of Camera sensor devices # # Lens drivers @@ -7009,7 +7009,7 @@ CONFIG_CIFS_DEBUG=y CONFIG_CIFS_FSCACHE=y # CONFIG_CIFS_ROOT is not set # CONFIG_SMB_SERVER is not set -CONFIG_SMBFS_COMMON=y +CONFIG_SMBFS=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_NLS=y diff --git a/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch b/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch index 21da17d29d..de24c472b0 100644 --- a/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch +++ b/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch @@ -1,78 +1,3 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Sat, 10 Oct 2020 15:32:18 +0000 -Subject: [PATCH] phy/rockchip: inno-hdmi: use correct vco_div_5 macro on - rk3328 - -inno_hdmi_phy_rk3328_clk_set_rate() is using the RK3228 macro -when configuring vco_div_5 on RK3328. - -Fix this by using correct vco_div_5 macro for RK3328. 
- -Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy") -Signed-off-by: Jonas Karlman ---- - drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -index 80acca4e9e14..15339338aae3 100644 ---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -@@ -790,8 +790,8 @@ static int inno_hdmi_phy_rk3328_clk_set_rate(struct clk_hw *hw, - RK3328_PRE_PLL_POWER_DOWN); - - /* Configure pre-pll */ -- inno_update_bits(inno, 0xa0, RK3228_PCLK_VCO_DIV_5_MASK, -- RK3228_PCLK_VCO_DIV_5(cfg->vco_div_5_en)); -+ inno_update_bits(inno, 0xa0, RK3328_PCLK_VCO_DIV_5_MASK, -+ RK3328_PCLK_VCO_DIV_5(cfg->vco_div_5_en)); - inno_write(inno, 0xa1, RK3328_PRE_PLL_PRE_DIV(cfg->prediv)); - - val = RK3328_SPREAD_SPECTRUM_MOD_DISABLE; - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Zheng Yang -Date: Sat, 10 Oct 2020 15:32:18 +0000 -Subject: [PATCH] phy/rockchip: inno-hdmi: round fractal pixclock in rk3328 - recalc_rate - -inno_hdmi_phy_rk3328_clk_recalc_rate() is returning a rate not found -in the pre pll config table when the fractal divider is used. -This can prevent proper power_on because a tmdsclock for the new rate -is not found in the pre pll config table. - -Fix this by saving and returning a rounded pixel rate that exist -in the pre pll config table. 
- -Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy") -Signed-off-by: Zheng Yang -Signed-off-by: Jonas Karlman ---- - drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -index 15339338aae3..15a008a1ac7b 100644 ---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -@@ -745,10 +745,12 @@ unsigned long inno_hdmi_phy_rk3328_clk_recalc_rate(struct clk_hw *hw, - do_div(vco, (nd * (no_a == 1 ? no_b : no_a) * no_d * 2)); - } - -- inno->pixclock = vco; -- dev_dbg(inno->dev, "%s rate %lu\n", __func__, inno->pixclock); -+ inno->pixclock = DIV_ROUND_CLOSEST((unsigned long)vco, 1000) * 1000; - -- return vco; -+ dev_dbg(inno->dev, "%s rate %lu vco %llu\n", -+ __func__, inno->pixclock, vco); -+ -+ return inno->pixclock; - } - - static long inno_hdmi_phy_rk3328_clk_round_rate(struct clk_hw *hw, - From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Sat, 10 Oct 2020 15:32:19 +0000 @@ -110,53 +35,6 @@ index 15a008a1ac7b..4b936ca19920 100644 do_div(vco, (nd * (no_a == 1 ? no_b : no_a) * no_d * 2)); -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Sat, 10 Oct 2020 15:32:19 +0000 -Subject: [PATCH] phy/rockchip: inno-hdmi: do not power on rk3328 post pll on - reg write - -inno_write is used to configure 0xaa reg, that also hold the -POST_PLL_POWER_DOWN bit. -When POST_PLL_REFCLK_SEL_TMDS is configured the power down bit is not -taken into consideration. - -Fix this by keeping the power down bit until configuration is complete. -Also reorder the reg write order for consistency. 
- -Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy") -Signed-off-by: Jonas Karlman ---- - drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -index 4b936ca19920..620961fcfc1d 100644 ---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c -@@ -1020,9 +1020,10 @@ inno_hdmi_phy_rk3328_power_on(struct inno_hdmi_phy *inno, - - inno_write(inno, 0xac, RK3328_POST_PLL_FB_DIV_7_0(cfg->fbdiv)); - if (cfg->postdiv == 1) { -- inno_write(inno, 0xaa, RK3328_POST_PLL_REFCLK_SEL_TMDS); - inno_write(inno, 0xab, RK3328_POST_PLL_FB_DIV_8(cfg->fbdiv) | - RK3328_POST_PLL_PRE_DIV(cfg->prediv)); -+ inno_write(inno, 0xaa, RK3328_POST_PLL_REFCLK_SEL_TMDS | -+ RK3328_POST_PLL_POWER_DOWN); - } else { - v = (cfg->postdiv / 2) - 1; - v &= RK3328_POST_PLL_POST_DIV_MASK; -@@ -1030,7 +1031,8 @@ inno_hdmi_phy_rk3328_power_on(struct inno_hdmi_phy *inno, - inno_write(inno, 0xab, RK3328_POST_PLL_FB_DIV_8(cfg->fbdiv) | - RK3328_POST_PLL_PRE_DIV(cfg->prediv)); - inno_write(inno, 0xaa, RK3328_POST_PLL_POST_DIV_ENABLE | -- RK3328_POST_PLL_REFCLK_SEL_TMDS); -+ RK3328_POST_PLL_REFCLK_SEL_TMDS | -+ RK3328_POST_PLL_POWER_DOWN); - } - - for (v = 0; v < 14; v++) - From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Huicong Xu Date: Sat, 10 Oct 2020 15:32:20 +0000 diff --git a/projects/Samsung/linux/linux.arm.conf b/projects/Samsung/linux/linux.arm.conf index 3f1eb18a31..b4e17e3030 100644 --- a/projects/Samsung/linux/linux.arm.conf +++ b/projects/Samsung/linux/linux.arm.conf @@ -1575,7 +1575,7 @@ CONFIG_SCSI_DMA=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_CHR_DEV_SG is not set +CONFIG_CHR_DEV_SG=m CONFIG_BLK_DEV_BSG=y # CONFIG_CHR_DEV_SCH is not set # CONFIG_SCSI_CONSTANTS is not set diff --git a/scripts/image 
b/scripts/image index e5a91214c6..a24f892022 100755 --- a/scripts/image +++ b/scripts/image @@ -509,6 +509,9 @@ if [ "${1}" = "release" -o "${1}" = "mkimage" -o "${1}" = "noobs" ]; then -e "s%@SYSTEM_SIZE@%${SYSTEM_SIZE}%g" \ -i ${RELEASE_DIR}/${NOOBS_DISTRO}/partitions.json + sed -e "s%@EXTRA_CMDLINE@%${EXTRA_CMDLINE}%g" \ + -i ${RELEASE_DIR}/${NOOBS_DISTRO}/partition_setup.sh + # Create System dir mkdir -p ${RELEASE_DIR}/${NOOBS_DISTRO}/System @@ -520,6 +523,9 @@ if [ "${1}" = "release" -o "${1}" = "mkimage" -o "${1}" = "noobs" ]; then fi done cp -PR ${INSTALL}/usr/share/bootloader/LICENCE* ${RELEASE_DIR}/${NOOBS_DISTRO}/System/ - cp -PR ${INSTALL}/usr/share/bootloader/bootcode.bin ${RELEASE_DIR}/${NOOBS_DISTRO}/System/ - cp -PR ${INSTALL}/usr/share/bootloader/fixup.dat ${RELEASE_DIR}/${NOOBS_DISTRO}/System/ - cp -PR ${INSTALL}/usr/share/bootloader/start.elf ${RELEASE_DIR}/${NOOBS_DISTRO}/System/ + # bootcode.bin, fixup.dat and start.elf are optional: copy only the ones that exist + for f in bootcode.bin fixup.dat start.elf ; do + if [ -f "${INSTALL}/usr/share/bootloader/$f" ]; then + cp -PR "${INSTALL}/usr/share/bootloader/$f" "${RELEASE_DIR}/${NOOBS_DISTRO}/System/" + fi + done diff --git a/scripts/mkimage b/scripts/mkimage index 0213f19019..ffb2c7c9b7 100755 --- a/scripts/mkimage +++ b/scripts/mkimage @@ -284,9 +284,12 @@ EOF mcopy "${RELEASE_DIR}/target/KERNEL.md5" "::/${KERNEL_NAME}.md5" >"${SAVE_ERROR}" 2>&1 || show_error mcopy "${RELEASE_DIR}/target/SYSTEM.md5" ::/SYSTEM.md5 >"${SAVE_ERROR}" 2>&1 || show_error - mcopy "${RELEASE_DIR}/3rdparty/bootloader/bootcode.bin" :: >"${SAVE_ERROR}" 2>&1 || show_error - mcopy "${RELEASE_DIR}/3rdparty/bootloader/fixup.dat" :: >"${SAVE_ERROR}" 2>&1 || show_error - mcopy "${RELEASE_DIR}/3rdparty/bootloader/start.elf" :: >"${SAVE_ERROR}" 2>&1 || show_error + for f in bootcode.bin fixup.dat start.elf ; do + if [ -f "${RELEASE_DIR}/3rdparty/bootloader/$f" ]; then + mcopy "${RELEASE_DIR}/3rdparty/bootloader/$f" :: >"${SAVE_ERROR}" 2>&1 || show_error + fi + done + mcopy
"${RELEASE_DIR}/3rdparty/bootloader/config.txt" :: >"${SAVE_ERROR}" 2>&1 || show_error for distro in "${RELEASE_DIR}/3rdparty/bootloader/distroconfig"*.txt ; do if [ -f "${distro}" ]; then